diff --git a/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_CPU.ipynb b/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_CPU.ipynb
index 435fd8a3a2..1cb36fab0b 100644
--- a/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_CPU.ipynb
+++ b/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_CPU.ipynb
@@ -47,76 +47,23 @@
"cell_type": "code",
"execution_count": 1,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Looking in links: https://download.pytorch.org/whl/torch_stable.html\n",
- "Requirement already up-to-date: torch==1.6.0+cpu in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (1.6.0+cpu)\n",
- "Requirement already up-to-date: torchvision==0.7.0+cpu in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (0.7.0+cpu)\n",
- "Requirement already satisfied, skipping upgrade: numpy in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from torch==1.6.0+cpu) (1.18.1)\n",
- "Requirement already satisfied, skipping upgrade: future in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from torch==1.6.0+cpu) (0.18.2)\n",
- "Requirement already satisfied, skipping upgrade: pillow>=4.1.1 in c:\\users\\tianl\\appdata\\roaming\\python\\python36\\site-packages (from torchvision==0.7.0+cpu) (7.0.0)\n",
- "Requirement already up-to-date: onnxruntime==1.4.0 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (1.4.0)\n",
- "Requirement already satisfied, skipping upgrade: numpy>=1.16.6 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime==1.4.0) (1.18.1)\n",
- "Requirement already satisfied, skipping upgrade: protobuf in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime==1.4.0) (3.11.3)\n",
- "Requirement already satisfied, skipping upgrade: six>=1.9 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from protobuf->onnxruntime==1.4.0) (1.14.0)\n",
- "Requirement already satisfied, skipping upgrade: setuptools in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from protobuf->onnxruntime==1.4.0) (45.2.0.post20200210)\n",
- "Requirement already up-to-date: onnxruntime-tools in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (1.4.1)\n",
- "Requirement already satisfied, skipping upgrade: py-cpuinfo in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (5.0.0)\n",
- "Requirement already satisfied, skipping upgrade: packaging in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (20.1)\n",
- "Requirement already satisfied, skipping upgrade: coloredlogs in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (14.0)\n",
- "Requirement already satisfied, skipping upgrade: py3nvml in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (0.2.5)\n",
- "Requirement already satisfied, skipping upgrade: numpy in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (1.18.1)\n",
- "Requirement already satisfied, skipping upgrade: psutil in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (5.7.0)\n",
- "Requirement already satisfied, skipping upgrade: onnx in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (1.7.0)\n",
- "Requirement already satisfied, skipping upgrade: pyparsing>=2.0.2 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from packaging->onnxruntime-tools) (2.4.6)\n",
- "Requirement already satisfied, skipping upgrade: six in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from packaging->onnxruntime-tools) (1.14.0)\n",
- "Requirement already satisfied, skipping upgrade: humanfriendly>=7.1 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from coloredlogs->onnxruntime-tools) (8.1)\n",
- "Requirement already satisfied, skipping upgrade: xmltodict in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from py3nvml->onnxruntime-tools) (0.12.0)\n",
- "Requirement already satisfied, skipping upgrade: protobuf in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnx->onnxruntime-tools) (3.11.3)\n",
- "Requirement already satisfied, skipping upgrade: typing-extensions>=3.6.2.1 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnx->onnxruntime-tools) (3.7.4.1)\n",
- "Requirement already satisfied, skipping upgrade: pyreadline; sys_platform == \"win32\" in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from humanfriendly>=7.1->coloredlogs->onnxruntime-tools) (2.1)\n",
- "Requirement already satisfied, skipping upgrade: setuptools in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from protobuf->onnx->onnxruntime-tools) (45.2.0.post20200210)\n",
- "Requirement already satisfied: transformers==3.0.2 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (3.0.2)\n",
- "Requirement already satisfied: packaging in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (20.1)\n",
- "Requirement already satisfied: sentencepiece!=0.1.92 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (0.1.85)\n",
- "Requirement already satisfied: sacremoses in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (0.0.38)\n",
- "Requirement already satisfied: dataclasses; python_version < \"3.7\" in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (0.7)\n",
- "Requirement already satisfied: requests in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (2.23.0)\n",
- "Requirement already satisfied: tokenizers==0.8.1.rc1 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (0.8.1rc1)\n",
- "Requirement already satisfied: tqdm>=4.27 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (4.43.0)\n",
- "Requirement already satisfied: regex!=2019.12.17 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (2020.2.20)\n",
- "Requirement already satisfied: filelock in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (3.0.12)\n",
- "Requirement already satisfied: numpy in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (1.18.1)\n",
- "Requirement already satisfied: pyparsing>=2.0.2 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from packaging->transformers==3.0.2) (2.4.6)\n",
- "Requirement already satisfied: six in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from packaging->transformers==3.0.2) (1.14.0)\n",
- "Requirement already satisfied: click in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from sacremoses->transformers==3.0.2) (7.0)\n",
- "Requirement already satisfied: joblib in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from sacremoses->transformers==3.0.2) (0.14.1)\n",
- "Requirement already satisfied: certifi>=2017.4.17 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from requests->transformers==3.0.2) (2020.4.5.1)\n",
- "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from requests->transformers==3.0.2) (1.25.8)\n",
- "Requirement already satisfied: idna<3,>=2.5 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from requests->transformers==3.0.2) (2.9)\n",
- "Requirement already satisfied: chardet<4,>=3.0.2 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from requests->transformers==3.0.2) (3.0.4)\n",
- "Requirement already satisfied: wget in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (3.2)\n",
- "Requirement already satisfied: netron in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (3.9.6)\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
- "# Install PyTorch 1.6.0 and OnnxRuntime 1.4.0 for CPU-only.\n",
"import sys\n",
- "if sys.platform == 'darwin': # Mac\n",
- " !{sys.executable} -m pip install --upgrade torch torchvision\n",
- "else:\n",
- " !{sys.executable} -m pip install --upgrade torch==1.6.0+cpu torchvision==0.7.0+cpu -f https://download.pytorch.org/whl/torch_stable.html\n",
- "!{sys.executable} -m pip install --upgrade onnxruntime==1.4.0\n",
- "!{sys.executable} -m pip install --upgrade onnxconverter_common\n",
- "!{sys.executable} -m pip install --upgrade onnxruntime-tools\n",
"\n",
- "# Install other packages used in this notebook.\n",
- "!{sys.executable} -m pip install transformers==3.0.2\n",
- "!{sys.executable} -m pip install wget netron"
+ "run_install = False  # Set to True on the first run; packages only need to be installed once\n",
+ "if run_install:\n",
+ "    if sys.platform in ['linux', 'win32']:  # Linux or Windows: pinned CPU-only ('+cpu') wheels are published for these platforms\n",
+ "        !{sys.executable} -m pip install torch==1.9.0+cpu torchvision==0.10.0+cpu torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html\n",
+ "    else:  # Mac: no '+cpu' wheels are published for macOS, so install the default packages\n",
+ "        !{sys.executable} -m pip install --upgrade torch torchvision torchaudio\n",
+ "\n",
+ "    !{sys.executable} -m pip install onnxruntime==1.8.1 onnx==1.9.0 onnxconverter_common==1.8.1\n",
+ "\n",
+ "    # Install other packages used in this notebook.\n",
+ "    !{sys.executable} -m pip install transformers==4.8.2\n",
+ "    !{sys.executable} -m pip install psutil pytz pandas py-cpuinfo py3nvml\n",
+ "    !{sys.executable} -m pip install wget netron"
]
},
{
@@ -196,14 +143,14 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForQuestionAnswering: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']\n",
- "- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).\n",
+ "Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForQuestionAnswering: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']\n",
+ "- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
- "100%|██████████████████████████████████████████████████████████████████████████████████| 48/48 [00:03<00:00, 12.87it/s]\n",
- "convert squad examples to features: 100%|███████████████████████████████████████████| 100/100 [00:00<00:00, 131.41it/s]\n",
- "add example index and unique id: 100%|████████████████████████████████████████████| 100/100 [00:00<00:00, 96776.74it/s]\n"
+ "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:03<00:00, 12.15it/s]\n",
+ "convert squad examples to features: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 135.87it/s]\n",
+ "add example index and unique id: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 100031.10it/s]\n"
]
}
],
@@ -252,6 +199,14 @@
"execution_count": 5,
"metadata": {},
"outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "d:\\git\\transformers\\src\\transformers\\modeling_utils.py:2074: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
+ " input_tensor.shape[chunk_dim] == tensor_shape for input_tensor in input_tensors\n"
+ ]
+ },
{
"name": "stdout",
"output_type": "stream",
@@ -319,7 +274,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "PyTorch cpu Inference time = 144.73 ms\n"
+ "PyTorch cpu Inference time = 119.80 ms\n"
]
}
],
@@ -348,45 +303,26 @@
"source": [
"## 4. Inference ONNX Model with ONNX Runtime ##\n",
"\n",
- "### OpenMP Environment Variable\n",
- "\n",
- "OpenMP environment variables are very important for CPU inference of Bert model. It has large performance impact on Bert model so you might need set it carefully according to [Performance Test Tool](#Performance-Test-Tool) result in later part of this notebook.\n",
- "\n",
- "Setting environment variables shall be done before importing onnxruntime. Otherwise, they might not take effect."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [],
- "source": [
- "import psutil\n",
- "\n",
- "# You may change the settings in this cell according to Performance Test Tool result.\n",
- "os.environ[\"OMP_NUM_THREADS\"] = str(psutil.cpu_count(logical=True))\n",
- "os.environ[\"OMP_WAIT_POLICY\"] = 'ACTIVE'"
+ "For ONNX Runtime 1.6.0 or older, OpenMP environment variables are very important for CPU inference of the BERT model. Since 1.7.0, the official package is no longer built with OpenMP."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "Now we are ready to inference the model with ONNX Runtime. Here we can see that OnnxRuntime has better performance than PyTorch. \n",
- "\n",
- "It is better to use standalone python script like [Performance Test tool](#Performance-Test-tool) to get accurate performance results."
+ "Now we run inference on the model with ONNX Runtime. Here we can see that ONNX Runtime has better performance than PyTorch."
]
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "OnnxRuntime cpu Inference time = 88.55 ms\n"
+ "OnnxRuntime cpu Inference time = 72.46 ms\n"
]
}
],
@@ -394,19 +330,15 @@
"import onnxruntime\n",
"import numpy\n",
"\n",
- "# Print warning if user uses onnxruntime-gpu instead of onnxruntime package.\n",
- "if 'CUDAExecutionProvider' in onnxruntime.get_available_providers():\n",
- " print(\"warning: onnxruntime-gpu is not built with OpenMP. You might try onnxruntime package to test CPU inference.\")\n",
- "\n",
"sess_options = onnxruntime.SessionOptions()\n",
"\n",
"# Optional: store the optimized graph and view it using Netron to verify that model is fully optimized.\n",
"# Note that this will increase session creation time, so it is for debugging only.\n",
"sess_options.optimized_model_filepath = os.path.join(output_dir, \"optimized_model_cpu.onnx\")\n",
"\n",
- "# For OnnxRuntime 1.2.0, you might need set intra_op_num_threads to 1 to enable OpenMP\n",
- "# sess_options.intra_op_num_threads=1\n",
- "# For OnnxRuntime 1.3.0 or later, it is recommended to use the default setting so you need not set it.\n",
+ "# For OnnxRuntime 1.7.0 or later, you can set the number of threads with intra_op_num_threads, for example:\n",
+ "# sess_options.intra_op_num_threads=4\n",
+ "# Here we use the default value which is a good choice in most cases.\n",
"\n",
"# Specify providers when you use onnxruntime-gpu for CPU inference.\n",
"session = onnxruntime.InferenceSession(export_model_path, sess_options, providers=['CPUExecutionProvider'])\n",
@@ -427,7 +359,7 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 8,
"metadata": {},
"outputs": [
{
@@ -474,17 +406,17 @@
"\n",
"Example Usage:\n",
"```\n",
- "from onnxruntime_tools import optimizer\n",
+ "from onnxruntime.transformers import optimizer\n",
"optimized_model = optimizer.optimize_model(export_model_path, model_type='bert', num_heads=12, hidden_size=768)\n",
"optimized_model.save_model_to_file(optimized_model_path)\n",
"```\n",
"\n",
- "You can also use optimizer_cli like the following:"
+ "You can also use the command line as follows:"
]
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 9,
"metadata": {},
"outputs": [
{
@@ -493,15 +425,17 @@
"text": [
" apply: Fused LayerNormalization count: 25\n",
" apply: Fused Gelu count: 12\n",
- " apply: Fused SkipLayerNormalization count: 25\n",
+ "adjust_reshape_and_expand: Removed Reshape and Expand count: 0\n",
+ " apply: Fused SkipLayerNormalization count: 24\n",
" apply: Fused Attention count: 12\n",
" prune_graph: Graph pruned: 0 inputs, 0 outputs and 5 nodes are removed\n",
" apply: Fused EmbedLayerNormalization(with mask) count: 1\n",
- " prune_graph: Graph pruned: 0 inputs, 0 outputs and 12 nodes are removed\n",
+ " prune_graph: Graph pruned: 0 inputs, 0 outputs and 3 nodes are removed\n",
" prune_graph: Graph pruned: 0 inputs, 0 outputs and 0 nodes are removed\n",
" apply: Fused BiasGelu count: 12\n",
" apply: Fused SkipLayerNormalization(add bias) count: 24\n",
" optimize: opset verion: 11\n",
+ " save_model_to_file: Sort graphs in topological order\n",
" save_model_to_file: Output model to ..\\onnx_models\\bert-base-cased-squad_opt_cpu.onnx\n",
"get_fused_operator_statistics: Optimized operators:{'EmbedLayerNormalization': 1, 'Attention': 12, 'Gelu': 0, 'FastGelu': 0, 'BiasGelu': 12, 'LayerNormalization': 0, 'SkipLayerNormalization': 24}\n",
" main: The model has been fully optimized.\n"
@@ -511,7 +445,7 @@
"source": [
"optimized_model_path = os.path.join(output_dir, 'bert-base-cased-squad_opt_cpu.onnx')\n",
"\n",
- "!{sys.executable} -m onnxruntime_tools.optimizer_cli --input $export_model_path --output $optimized_model_path --model_type bert --num_heads 12 --hidden_size 768"
+ "!{sys.executable} -m onnxruntime.transformers.optimizer --input $export_model_path --output $optimized_model_path --model_type bert --num_heads 12 --hidden_size 768"
]
},
{
@@ -527,7 +461,7 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 10,
"metadata": {},
"outputs": [
{
@@ -561,7 +495,7 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 11,
"metadata": {},
"outputs": [
{
@@ -569,13 +503,13 @@
"output_type": "stream",
"text": [
"100% passed for 100 random inputs given thresholds (rtol=0.001, atol=0.0001).\n",
- "maximum absolute difference=5.930662155151367e-06\n",
- "maximum relative difference=0.021568937227129936\n"
+ "maximum absolute difference=4.604458808898926e-06\n",
+ "maximum relative difference=0.006278202868998051\n"
]
}
],
"source": [
- "!{sys.executable} -m onnxruntime_tools.transformers.compare_bert_results --baseline_model $export_model_path --optimized_model $optimized_model_path --batch_size 1 --sequence_length 128 --samples 100"
+ "!{sys.executable} -m onnxruntime.transformers.compare_bert_results --baseline_model $export_model_path --optimized_model $optimized_model_path --batch_size 1 --sequence_length 128 --samples 100"
]
},
{
@@ -591,45 +525,45 @@
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=0,OMP_NUM_THREADS=,OMP_WAIT_POLICY=,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
- "Average latency = 80.08 ms, Throughput = 12.49 QPS\n",
- "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=,OMP_WAIT_POLICY=,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
- "Average latency = 78.56 ms, Throughput = 12.73 QPS\n",
- "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
- "Average latency = 77.78 ms, Throughput = 12.86 QPS\n",
- "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
- "Average latency = 73.52 ms, Throughput = 13.60 QPS\n",
- "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=6,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
- "Average latency = 95.36 ms, Throughput = 10.49 QPS\n",
- "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=6,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
- "Average latency = 78.49 ms, Throughput = 12.74 QPS\n",
- "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
- "Average latency = 288.71 ms, Throughput = 3.46 QPS\n",
- "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
- "Average latency = 288.61 ms, Throughput = 3.46 QPS\n",
- "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=6,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
- "Average latency = 288.97 ms, Throughput = 3.46 QPS\n",
- "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=6,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
- "Average latency = 288.37 ms, Throughput = 3.47 QPS\n",
- "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=6,OMP_NUM_THREADS=6,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
- "Average latency = 95.49 ms, Throughput = 10.47 QPS\n",
- "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=6,OMP_NUM_THREADS=6,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
- "Average latency = 79.17 ms, Throughput = 12.63 QPS\n",
- "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=100, test_times=1, contiguous=None, use_gpu=False, warmup=True, omp_num_threads=None, omp_wait_policy=None, intra_op_num_threads=None, seed=3, verbose=False, inclusive=False, extra_latency=True)\n",
+ "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
+ "Average latency = 54.26 ms, Throughput = 18.43 QPS\n",
+ "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=11,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
+ "Average latency = 55.80 ms, Throughput = 17.92 QPS\n",
+ "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=10,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
+ "Average latency = 65.31 ms, Throughput = 15.31 QPS\n",
+ "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=9,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
+ "Average latency = 57.66 ms, Throughput = 17.34 QPS\n",
+ "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
+ "Average latency = 62.84 ms, Throughput = 15.91 QPS\n",
+ "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=7,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
+ "Average latency = 69.29 ms, Throughput = 14.43 QPS\n",
+ "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=6,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
+ "Average latency = 56.19 ms, Throughput = 17.80 QPS\n",
+ "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=5,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
+ "Average latency = 59.90 ms, Throughput = 16.70 QPS\n",
+ "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=4,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
+ "Average latency = 63.72 ms, Throughput = 15.69 QPS\n",
+ "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
+ "Average latency = 82.44 ms, Throughput = 12.13 QPS\n",
+ "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=2,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
+ "Average latency = 119.64 ms, Throughput = 8.36 QPS\n",
+ "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
+ "Average latency = 223.21 ms, Throughput = 4.48 QPS\n",
+ "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=100, test_times=1, use_gpu=False, intra_op_num_threads=None, seed=3, verbose=False)\n",
"Generating 100 samples for batch_size=1 sequence_length=128\n",
- "Test summary is saved to ..\\onnx_models\\perf_results_CPU_B1_S128_20200806-173045.txt\n"
+ "Test summary is saved to ..\\onnx_models\\perf_results_CPU_B1_S128_20210713-144140.txt\n"
]
}
],
"source": [
- "!{sys.executable} -m onnxruntime_tools.transformers.bert_perf_test --model $optimized_model_path --batch_size 1 --sequence_length 128 --samples 100 --test_times 1 --inclusive --all"
+ "!{sys.executable} -m onnxruntime.transformers.bert_perf_test --model $optimized_model_path --batch_size 1 --sequence_length 128 --samples 100 --test_times 1"
]
},
{
@@ -641,14 +575,14 @@
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "..\\onnx_models\\perf_results_CPU_B1_S128_20200806-173045.txt\n"
+ "..\\onnx_models\\perf_results_CPU_B1_S128_20210713-144140.txt\n"
]
},
{
@@ -678,155 +612,116 @@
"
Latency_P99 | \n",
" Throughput(QPS) | \n",
" intra_op_num_threads | \n",
- " OMP_NUM_THREADS | \n",
- " OMP_WAIT_POLICY | \n",
- " contiguous | \n",
" \n",
" \n",
" \n",
" \n",
" | 0 | \n",
- " 73.52 | \n",
- " 75.78 | \n",
- " 78.21 | \n",
- " 89.29 | \n",
- " 13.60 | \n",
- " 1 | \n",
+ " 54.26 | \n",
+ " 56.05 | \n",
+ " 60.32 | \n",
+ " 109.21 | \n",
+ " 18.43 | \n",
" 12 | \n",
- " PASSIVE | \n",
- " None | \n",
"
\n",
" \n",
" | 1 | \n",
- " 77.78 | \n",
- " 82.35 | \n",
- " 87.02 | \n",
- " 104.54 | \n",
- " 12.86 | \n",
- " 1 | \n",
- " 12 | \n",
- " ACTIVE | \n",
- " None | \n",
+ " 55.80 | \n",
+ " 56.74 | \n",
+ " 59.67 | \n",
+ " 73.62 | \n",
+ " 17.92 | \n",
+ " 11 | \n",
"
\n",
" \n",
" | 2 | \n",
- " 78.49 | \n",
- " 80.92 | \n",
- " 85.77 | \n",
- " 98.98 | \n",
- " 12.74 | \n",
- " 1 | \n",
+ " 56.19 | \n",
+ " 61.29 | \n",
+ " 71.69 | \n",
+ " 80.15 | \n",
+ " 17.80 | \n",
" 6 | \n",
- " PASSIVE | \n",
- " None | \n",
"
\n",
" \n",
" | 3 | \n",
- " 78.56 | \n",
- " 82.29 | \n",
- " 93.46 | \n",
- " 108.73 | \n",
- " 12.73 | \n",
- " 1 | \n",
- " | \n",
- " | \n",
- " None | \n",
+ " 57.66 | \n",
+ " 58.50 | \n",
+ " 61.96 | \n",
+ " 65.12 | \n",
+ " 17.34 | \n",
+ " 9 | \n",
"
\n",
" \n",
" | 4 | \n",
- " 79.17 | \n",
- " 82.02 | \n",
- " 87.60 | \n",
- " 99.55 | \n",
- " 12.63 | \n",
- " 6 | \n",
- " 6 | \n",
- " PASSIVE | \n",
- " None | \n",
+ " 59.90 | \n",
+ " 59.72 | \n",
+ " 65.16 | \n",
+ " 116.16 | \n",
+ " 16.70 | \n",
+ " 5 | \n",
"
\n",
" \n",
" | 5 | \n",
- " 80.08 | \n",
- " 83.18 | \n",
- " 95.60 | \n",
- " 107.72 | \n",
- " 12.49 | \n",
- " 0 | \n",
- " | \n",
- " | \n",
- " None | \n",
+ " 62.84 | \n",
+ " 67.05 | \n",
+ " 69.07 | \n",
+ " 75.99 | \n",
+ " 15.91 | \n",
+ " 8 | \n",
"
\n",
" \n",
" | 6 | \n",
- " 95.36 | \n",
- " 101.25 | \n",
- " 103.61 | \n",
- " 105.15 | \n",
- " 10.49 | \n",
- " 1 | \n",
- " 6 | \n",
- " ACTIVE | \n",
- " None | \n",
+ " 63.72 | \n",
+ " 64.17 | \n",
+ " 69.44 | \n",
+ " 73.10 | \n",
+ " 15.69 | \n",
+ " 4 | \n",
"
\n",
" \n",
" | 7 | \n",
- " 95.49 | \n",
- " 101.50 | \n",
- " 102.66 | \n",
- " 104.82 | \n",
- " 10.47 | \n",
- " 6 | \n",
- " 6 | \n",
- " ACTIVE | \n",
- " None | \n",
+ " 65.31 | \n",
+ " 65.35 | \n",
+ " 80.70 | \n",
+ " 177.94 | \n",
+ " 15.31 | \n",
+ " 10 | \n",
"
\n",
" \n",
" | 8 | \n",
- " 288.37 | \n",
- " 290.48 | \n",
- " 295.37 | \n",
- " 308.91 | \n",
- " 3.47 | \n",
- " 6 | \n",
- " 1 | \n",
- " PASSIVE | \n",
- " None | \n",
+ " 69.29 | \n",
+ " 69.04 | \n",
+ " 70.68 | \n",
+ " 85.03 | \n",
+ " 14.43 | \n",
+ " 7 | \n",
"
\n",
" \n",
" | 9 | \n",
- " 288.61 | \n",
- " 291.10 | \n",
- " 295.78 | \n",
- " 301.52 | \n",
- " 3.46 | \n",
- " 12 | \n",
- " 1 | \n",
- " PASSIVE | \n",
- " None | \n",
+ " 82.44 | \n",
+ " 83.20 | \n",
+ " 89.64 | \n",
+ " 98.80 | \n",
+ " 12.13 | \n",
+ " 3 | \n",
"
\n",
" \n",
" | 10 | \n",
- " 288.71 | \n",
- " 292.64 | \n",
- " 298.28 | \n",
- " 305.92 | \n",
- " 3.46 | \n",
- " 12 | \n",
- " 1 | \n",
- " ACTIVE | \n",
- " None | \n",
+ " 119.64 | \n",
+ " 119.07 | \n",
+ " 122.62 | \n",
+ " 135.67 | \n",
+ " 8.36 | \n",
+ " 2 | \n",
"
\n",
" \n",
" | 11 | \n",
- " 288.97 | \n",
- " 291.18 | \n",
- " 297.68 | \n",
- " 309.30 | \n",
- " 3.46 | \n",
- " 6 | \n",
+ " 223.21 | \n",
+ " 223.22 | \n",
+ " 226.83 | \n",
+ " 249.08 | \n",
+ " 4.48 | \n",
" 1 | \n",
- " ACTIVE | \n",
- " None | \n",
"
\n",
" \n",
"\n",
@@ -834,35 +729,35 @@
],
"text/plain": [
" Latency(ms) Latency_P75 Latency_P90 Latency_P99 Throughput(QPS) \\\n",
- "0 73.52 75.78 78.21 89.29 13.60 \n",
- "1 77.78 82.35 87.02 104.54 12.86 \n",
- "2 78.49 80.92 85.77 98.98 12.74 \n",
- "3 78.56 82.29 93.46 108.73 12.73 \n",
- "4 79.17 82.02 87.60 99.55 12.63 \n",
- "5 80.08 83.18 95.60 107.72 12.49 \n",
- "6 95.36 101.25 103.61 105.15 10.49 \n",
- "7 95.49 101.50 102.66 104.82 10.47 \n",
- "8 288.37 290.48 295.37 308.91 3.47 \n",
- "9 288.61 291.10 295.78 301.52 3.46 \n",
- "10 288.71 292.64 298.28 305.92 3.46 \n",
- "11 288.97 291.18 297.68 309.30 3.46 \n",
+ "0 54.26 56.05 60.32 109.21 18.43 \n",
+ "1 55.80 56.74 59.67 73.62 17.92 \n",
+ "2 56.19 61.29 71.69 80.15 17.80 \n",
+ "3 57.66 58.50 61.96 65.12 17.34 \n",
+ "4 59.90 59.72 65.16 116.16 16.70 \n",
+ "5 62.84 67.05 69.07 75.99 15.91 \n",
+ "6 63.72 64.17 69.44 73.10 15.69 \n",
+ "7 65.31 65.35 80.70 177.94 15.31 \n",
+ "8 69.29 69.04 70.68 85.03 14.43 \n",
+ "9 82.44 83.20 89.64 98.80 12.13 \n",
+ "10 119.64 119.07 122.62 135.67 8.36 \n",
+ "11 223.21 223.22 226.83 249.08 4.48 \n",
"\n",
- " intra_op_num_threads OMP_NUM_THREADS OMP_WAIT_POLICY contiguous \n",
- "0 1 12 PASSIVE None \n",
- "1 1 12 ACTIVE None \n",
- "2 1 6 PASSIVE None \n",
- "3 1 None \n",
- "4 6 6 PASSIVE None \n",
- "5 0 None \n",
- "6 1 6 ACTIVE None \n",
- "7 6 6 ACTIVE None \n",
- "8 6 1 PASSIVE None \n",
- "9 12 1 PASSIVE None \n",
- "10 12 1 ACTIVE None \n",
- "11 6 1 ACTIVE None "
+ " intra_op_num_threads \n",
+ "0 12 \n",
+ "1 11 \n",
+ "2 6 \n",
+ "3 9 \n",
+ "4 5 \n",
+ "5 8 \n",
+ "6 4 \n",
+ "7 10 \n",
+ "8 7 \n",
+ "9 3 \n",
+ "10 2 \n",
+ "11 1 "
]
},
- "execution_count": 14,
+ "execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
@@ -876,7 +771,7 @@
"print(latest_result_file)\n",
"\n",
"# Remove some columns that have same values for all rows.\n",
- "columns_to_remove = ['model', 'graph_optimization_level', 'batch_size', 'sequence_length', 'test_cases', 'test_times', 'use_gpu', 'warmup']\n",
+ "columns_to_remove = ['model', 'graph_optimization_level', 'batch_size', 'sequence_length', 'test_cases', 'test_times', 'use_gpu']\n",
"# Hide some latency percentile columns to fit screen width.\n",
"columns_to_remove.extend(['Latency_P50', 'Latency_P95'])\n",
"result_data.drop(columns_to_remove, axis=1, inplace=True)\n",
@@ -901,7 +796,7 @@
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": 14,
"metadata": {},
"outputs": [
{
@@ -910,12 +805,12 @@
"text": [
"{\n",
" \"gpu\": {\n",
- " \"driver_version\": \"442.23\",\n",
+ " \"driver_version\": \"470.14\",\n",
" \"devices\": [\n",
" {\n",
" \"memory_total\": 8589934592,\n",
- " \"memory_available\": 6997721088,\n",
- " \"name\": \"GeForce GTX 1070\"\n",
+ " \"memory_available\": 6782619648,\n",
+ " \"name\": \"NVIDIA GeForce GTX 1070\"\n",
" }\n",
" ]\n",
" },\n",
@@ -925,22 +820,98 @@
" \"logical_cores\": 12,\n",
" \"hz\": \"3.1920 GHz\",\n",
" \"l2_cache\": \"1536 KB\",\n",
- " \"l3_cache\": \"12288 KB\",\n",
+ " \"flags\": [\n",
+ " \"3dnow\",\n",
+ " \"3dnowprefetch\",\n",
+ " \"abm\",\n",
+ " \"acpi\",\n",
+ " \"adx\",\n",
+ " \"aes\",\n",
+ " \"apic\",\n",
+ " \"avx\",\n",
+ " \"avx2\",\n",
+ " \"bmi1\",\n",
+ " \"bmi2\",\n",
+ " \"clflush\",\n",
+ " \"clflushopt\",\n",
+ " \"cmov\",\n",
+ " \"cx16\",\n",
+ " \"cx8\",\n",
+ " \"de\",\n",
+ " \"dtes64\",\n",
+ " \"dts\",\n",
+ " \"erms\",\n",
+ " \"est\",\n",
+ " \"f16c\",\n",
+ " \"fma\",\n",
+ " \"fpu\",\n",
+ " \"fxsr\",\n",
+ " \"hle\",\n",
+ " \"ht\",\n",
+ " \"hypervisor\",\n",
+ " \"ia64\",\n",
+ " \"invpcid\",\n",
+ " \"lahf_lm\",\n",
+ " \"mca\",\n",
+ " \"mce\",\n",
+ " \"mmx\",\n",
+ " \"movbe\",\n",
+ " \"mpx\",\n",
+ " \"msr\",\n",
+ " \"mtrr\",\n",
+ " \"osxsave\",\n",
+ " \"pae\",\n",
+ " \"pat\",\n",
+ " \"pbe\",\n",
+ " \"pcid\",\n",
+ " \"pclmulqdq\",\n",
+ " \"pdcm\",\n",
+ " \"pge\",\n",
+ " \"pni\",\n",
+ " \"popcnt\",\n",
+ " \"pse\",\n",
+ " \"pse36\",\n",
+ " \"rdrnd\",\n",
+ " \"rdseed\",\n",
+ " \"rtm\",\n",
+ " \"sep\",\n",
+ " \"serial\",\n",
+ " \"sgx\",\n",
+ " \"sgx_lc\",\n",
+ " \"smap\",\n",
+ " \"smep\",\n",
+ " \"ss\",\n",
+ " \"sse\",\n",
+ " \"sse2\",\n",
+ " \"sse4_1\",\n",
+ " \"sse4_2\",\n",
+ " \"ssse3\",\n",
+ " \"tm\",\n",
+ " \"tm2\",\n",
+ " \"tsc\",\n",
+ " \"tscdeadline\",\n",
+ " \"vme\",\n",
+ " \"x2apic\",\n",
+ " \"xsave\",\n",
+ " \"xtpr\"\n",
+ " ],\n",
" \"processor\": \"Intel64 Family 6 Model 158 Stepping 10, GenuineIntel\"\n",
" },\n",
" \"memory\": {\n",
- " \"total\": 16971276288,\n",
- " \"available\": 4723568640\n",
+ " \"total\": 16977195008,\n",
+ " \"available\": 6085459968\n",
" },\n",
" \"python\": \"3.6.10.final.0 (64 bit)\",\n",
- " \"os\": \"Windows-10-10.0.19041-SP0\",\n",
+ " \"os\": \"Windows-10-10.0.21390-SP0\",\n",
" \"onnxruntime\": {\n",
- " \"version\": \"1.4.0\",\n",
+ " \"version\": \"1.8.1\",\n",
" \"support_gpu\": false\n",
" },\n",
+ " \"onnxruntime_tools\": null,\n",
" \"pytorch\": {\n",
- " \"version\": \"1.6.0+cpu\",\n",
- " \"support_gpu\": false\n",
+ " \"version\": \"1.9.0+cpu\",\n",
+ " \"support_gpu\": false,\n",
+ " \"cuda\": null\n",
" },\n",
" \"tensorflow\": {\n",
" \"version\": \"2.3.0\",\n",
@@ -954,20 +925,14 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "2020-08-06 17:30:50.400838: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library cudart64_101.dll\n"
+ "2021-07-13 14:41:45.376756: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'cudart64_101.dll'; dlerror: cudart64_101.dll not found\n",
+ "2021-07-13 14:41:45.376780: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\n"
]
}
],
"source": [
- "!{sys.executable} -m onnxruntime_tools.transformers.machine_info --silent"
+ "!{sys.executable} -m onnxruntime.transformers.machine_info --silent"
]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
}
],
"metadata": {
diff --git a/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb b/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb
index 3b090b8232..1016aef1fe 100644
--- a/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb
+++ b/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb
@@ -36,17 +36,16 @@
"First, we install [AnaConda](https://www.anaconda.com/distribution/) in a target machine and open an AnaConda prompt window when it is done. Then run the following commands to create a conda environment. This notebook is tested with PyTorch 1.5.0 and OnnxRuntime 1.3.0.\n",
"\n",
"```console\n",
- "conda create -n gpu_env python=3.7\n",
+ "conda create -n gpu_env python=3.6\n",
"conda activate gpu_env\n",
- "conda install pytorch torchvision cudatoolkit=10.1 -c pytorch\n",
"conda install -c anaconda ipykernel\n",
"conda install -c conda-forge ipywidgets\n",
- "python -m ipykernel install --user --name=gpu_env_py37\n",
+ "python -m ipykernel install --user --name=gpu_env\n",
"jupyter notebook\n",
"```\n",
- "Finally, launch Jupyter Notebook and you can choose gpu_env_py37 as kernel to run this notebook.\n",
+ "Finally, launch Jupyter Notebook and you can choose gpu_env as kernel to run this notebook.\n",
"\n",
- "Onnxruntime-gpu need specified version of CUDA and cuDNN. You can find the corresponding version in [requirements](https://github.com/microsoft/onnxruntime/tree/rel-1.3.0#system-requirements). If the version is different from above cudatoolkit version, you have to install them separately, and add their bin directories to PATH environment variable (See [CUDA and cuDNN Path](#CUDA-and-cuDNN-Path) below)."
+ "Onnxruntime-gpu needs specific versions of CUDA and cuDNN. You can find the requirements [here](http://www.onnxruntime.ai/docs/how-to/install.html). Remember to add the CUDA and cuDNN directories to the PATH environment variable (see [CUDA and cuDNN Path](#CUDA-and-cuDNN-Path) below)."
]
},
{
@@ -56,12 +55,46 @@
"outputs": [],
"source": [
"import sys\n",
- "!{sys.executable} -m pip uninstall --quiet --yes onnxruntime-gpu\n",
- "!{sys.executable} -m pip install --quiet onnxruntime-gpu\n",
- "!{sys.executable} -m pip install --quiet --upgrade transformers\n",
- "!{sys.executable} -m pip install --quiet --upgrade onnxconverter_common\n",
- "!{sys.executable} -m pip install --quiet --upgrade onnxruntime-tools\n",
- "!{sys.executable} -m pip install --quiet wget netron pandas"
+ "\n",
+ "run_install = False # Only need install once\n",
+ "if run_install:\n",
+ " if sys.platform in ['linux', 'win32']: # Linux or Windows\n",
+ " !{sys.executable} -m pip install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio===0.9.0 -f https://download.pytorch.org/whl/torch_stable.html\n",
+ " else: # Mac\n",
+ " print(\"PyTorch 1.9 MacOS Binaries do not support CUDA, install from source instead\")\n",
+ "\n",
+ " !{sys.executable} -m pip install onnxruntime-gpu==1.8.1 onnx==1.9.0 onnxconverter_common==1.8.1\n",
+ "\n",
+ " # Install other packages used in this notebook.\n",
+ " !{sys.executable} -m pip install transformers==4.8.2\n",
+ " !{sys.executable} -m pip install psutil pytz pandas py-cpuinfo py3nvml coloredlogs wget netron sympy"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "pytorch: 1.9.0+cu111\n",
+ "onnxruntime: 1.8.1\n",
+ "onnx: 1.9.0\n",
+ "transformers: 4.8.2\n"
+ ]
+ }
+ ],
+ "source": [
+ "import torch\n",
+ "import onnx\n",
+ "import onnxruntime\n",
+ "import transformers\n",
+ "print(\"pytorch:\", torch.__version__)\n",
+ "print(\"onnxruntime:\", onnxruntime.__version__)\n",
+ "print(\"onnx:\", onnx.__version__)\n",
+ "print(\"transformers:\", transformers.__version__)"
]
},
{
@@ -80,7 +113,7 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -108,7 +141,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -131,12 +164,12 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
- "# For fine-tuned large model, the model name is \"bert-large-uncased-whole-word-masking-finetuned-squad\". Here we use bert-base for demo.\n",
- "model_name_or_path = \"bert-base-cased\"\n",
+ "# fine-tuned model from https://huggingface.co/models?search=squad\n",
+ "model_name_or_path = \"bert-large-uncased-whole-word-masking-finetuned-squad\"\n",
"max_seq_length = 128\n",
"doc_stride = 128\n",
"max_query_length = 64"
@@ -151,16 +184,16 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
- "100%|██████████| 48/48 [00:04<00:00, 11.28it/s]\n",
- "convert squad examples to features: 100%|██████████| 1000/1000 [00:09<00:00, 102.15it/s]\n",
- "add example index and unique id: 100%|██████████| 1000/1000 [00:00<00:00, 161306.98it/s]\n"
+ "100%|██████████| 48/48 [00:03<00:00, 14.24it/s]\n",
+ "convert squad examples to features: 100%|██████████| 1000/1000 [00:08<00:00, 112.67it/s]\n",
+ "add example index and unique id: 100%|██████████| 1000/1000 [00:00<00:00, 836518.55it/s]\n"
]
}
],
@@ -206,9 +239,17 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 7,
"metadata": {},
"outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/disk/conda3/envs/gpu_env/lib/python3.6/site-packages/transformers/modeling_utils.py:1974: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
+ " input_tensor.shape[chunk_dim] == tensor_shape for input_tensor in input_tensors\n"
+ ]
+ },
{
"name": "stdout",
"output_type": "stream",
@@ -270,14 +311,14 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "PyTorch cuda Inference time = 16.57 ms\n"
+ "PyTorch cuda Inference time = 16.56 ms\n"
]
}
],
@@ -307,47 +348,7 @@
"## 4. Inference ONNX Model with ONNX Runtime ##\n",
"\n",
"### CUDA and cuDNN Path\n",
- "onnxruntime-gpu has dependency on [CUDA](https://developer.nvidia.com/cuda-downloads) and [cuDNN](https://developer.nvidia.com/cudnn):\n",
- "\n",
- "* [onnxruntime-gpu v1.3.0](https://github.com/microsoft/onnxruntime/tree/rel-1.3.0#system-requirements) requires CUDA Runtime 10.1 and CUDNN 7.6.5.\n",
- "* [onnxruntime-gpu v1.2.0](https://github.com/microsoft/onnxruntime/releases/tag/v1.2.0) requires CUDA Runtime 10.1 and CUDNN 7.6.5.\n",
- "\n",
- "During installing PyTorch 1.5, we installed cudatoolkit 10.1.243 in this conda environment. That shall be good for onnxruntime-gpu 1.3.0 in Jupyter Notebook."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Change to True when onnxruntime (like onnxruntime-gpu 1.0.0 ~ 1.1.2) cannot be imported.\n",
- "add_cuda_path = False\n",
- "\n",
- "if add_cuda_path:\n",
- " # Add path of CUDA 10.0 and CUDNN 7.6 for onnxruntime-gpu 1.0.0 ~ 1.1.2\n",
- " cuda_dir = 'D:/NVidia/CUDA/v10.1/bin'\n",
- " cudnn_dir = 'D:/NVidia/CUDA/v10.1/bin'\n",
- " if not (os.path.exists(cuda_dir) and os.path.exists(cudnn_dir)):\n",
- " raise ValueError(\"Please specify correct path for CUDA and cuDNN. Otherwise onnxruntime cannot be imported.\")\n",
- " else:\n",
- " if cuda_dir == cudnn_dir:\n",
- " os.environ[\"PATH\"] = cuda_dir + ';' + os.environ[\"PATH\"]\n",
- " else:\n",
- " os.environ[\"PATH\"] = cuda_dir + ';' + cudnn_dir + ';' + os.environ[\"PATH\"]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### OpenMP Environment Variable\n",
- "\n",
- "OpenMP environment variables are optional for GPU inference of standard Bert model. It has little performance impact on Bert model since most nodes are executed in GPU. \n",
- "\n",
- "You can find the best setting based on [Performance Test Tool](#Performance-Test-Tool) result in later part of this notebook.\n",
- "\n",
- "**Attention: Setting environment variables shall be done before importing onnxruntime**. Otherwise, they might not take effect."
+ "onnxruntime-gpu depends on [CUDA](https://developer.nvidia.com/cuda-downloads) and [cuDNN](https://developer.nvidia.com/cudnn). The required CUDA version can be found [here](http://www.onnxruntime.ai/docs/reference/execution-providers/CUDA-ExecutionProvider.html#requirements).\n"
]
},
{
@@ -356,9 +357,21 @@
"metadata": {},
"outputs": [],
"source": [
- "# Optional. You can change them according to Performance Test Tool result.\n",
- "#os.environ[\"OMP_NUM_THREADS\"] = '1'\n",
- "#os.environ[\"OMP_WAIT_POLICY\"] = 'PASSIVE'"
+ "# Change to True when onnxruntime (like onnxruntime-gpu 1.0.0 ~ 1.1.2) cannot be imported.\n",
+ "add_cuda_path = False\n",
+ "\n",
+ "# For Linux, see https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#environment-setup\n",
+ "# Below is an example for Windows\n",
+ "if add_cuda_path:\n",
+ " cuda_dir = 'D:/NVidia/CUDA/v11.0/bin'\n",
+ " cudnn_dir = 'D:/NVidia/CUDA/v11.0/bin'\n",
+ " if not (os.path.exists(cuda_dir) and os.path.exists(cudnn_dir)):\n",
+ " raise ValueError(\"Please specify correct path for CUDA and cuDNN. Otherwise onnxruntime cannot be imported.\")\n",
+ " else:\n",
+ " if cuda_dir == cudnn_dir:\n",
+ " os.environ[\"PATH\"] = cuda_dir + ';' + os.environ[\"PATH\"]\n",
+ " else:\n",
+ " os.environ[\"PATH\"] = cuda_dir + ';' + cudnn_dir + ';' + os.environ[\"PATH\"]"
]
},
{
@@ -377,7 +390,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "OnnxRuntime gpu Inference time = 4.43 ms\n"
+ "OnnxRuntime gpu Inference time = 25.28 ms\n"
]
}
],
@@ -403,7 +416,7 @@
"latency = []\n",
"for i in range(total_samples):\n",
" data = dataset[i]\n",
- " # TODO: use IO Binding (see https://github.com/microsoft/onnxruntime/pull/4206) to improve performance.\n",
+ " # TODO: use IO Binding (see https://www.onnxruntime.ai/python/api_summary.html) to improve performance.\n",
" ort_inputs = {\n",
" 'input_ids': data[0].cpu().reshape(1, max_seq_length).numpy(),\n",
" 'input_mask': data[1].cpu().reshape(1, max_seq_length).numpy(),\n",
@@ -436,9 +449,9 @@
"text": [
"***** Verifying correctness *****\n",
"PyTorch and ONNX Runtime output 0 are close: True\n",
- "maximum_diff=9.499490261077881e-07 average_diff=1.4225952327251434e-07\n",
+ "maximum_diff=5.7220458984375e-06 average_diff=1.3103708624839783e-06\n",
"PyTorch and ONNX Runtime output 1 are close: True\n",
- "maximum_diff=6.92903995513916e-07 average_diff=1.2441887520253658e-07\n"
+ "maximum_diff=5.7220458984375e-06 average_diff=1.2257369235157967e-06\n"
]
}
],
@@ -472,13 +485,13 @@
{
"data": {
"text/plain": [
- "{'input_ids': tensor([[ 101, 1293, 1242, 2557, 1127, 1226, 1104, 1103, 3613, 16429,\n",
- " 5235, 136, 102, 3613, 16429, 5988, 170, 107, 1353, 1671,\n",
- " 1992, 1342, 107, 5235, 117, 1107, 1134, 1473, 3683, 3538,\n",
- " 1125, 170, 1476, 118, 1248, 2595, 4086, 1714, 1104, 2965,\n",
- " 15897, 1104, 3613, 16429, 119, 1473, 3683, 3538, 3222, 1149,\n",
- " 2551, 1168, 23759, 1116, 1121, 1506, 1103, 10280, 2231, 1111,\n",
- " 1103, 1714, 16355, 119, 102, 0, 0, 0, 0, 0,\n",
+ "{'input_ids': tensor([[ 101, 2054, 2329, 2694, 2897, 2097, 4287, 1996, 3565, 4605,\n",
+ " 1029, 102, 1999, 1996, 2142, 2983, 1010, 4035, 2557, 1019,\n",
+ " 2444, 1998, 1019, 2444, 2998, 4469, 2097, 4287, 1996, 5049,\n",
+ " 1012, 1996, 4035, 2097, 4287, 2049, 2219, 2329, 2394, 3743,\n",
+ " 1010, 2007, 6754, 10184, 1010, 12270, 10589, 1998, 6857, 8945,\n",
+ " 18505, 2006, 8570, 1012, 102, 0, 0, 0, 0, 0,\n",
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
@@ -488,13 +501,13 @@
" device='cuda:0'),\n",
" 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
" 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
- " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,\n",
+ " 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0'),\n",
- " 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
+ " 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
" 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
- " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,\n",
+ " 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0')}"
@@ -526,8 +539,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Average length 101\n",
- "OnnxRuntime gpu Inference time with actual sequence length = 4.23 ms\n"
+ "Average length 94\n",
+ "OnnxRuntime gpu Inference time with actual sequence length = 21.93 ms\n"
]
}
],
@@ -611,12 +624,12 @@
"\n",
"Example Usage:\n",
"```\n",
- "from onnxruntime_tools import optimizer\n",
+ "from onnxruntime.transformers import optimizer\n",
"optimized_model = optimizer.optimize_model(export_model_path, model_type='bert', num_heads=12, hidden_size=768)\n",
"optimized_model.save_model_to_file(optimized_model_path)\n",
"```\n",
"\n",
- "You can also use optimizer_cli like the following:"
+ "You can also use the command line as follows:"
]
},
{
@@ -638,20 +651,21 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "optimize_by_onnxruntime: Save optimized model by onnxruntime to ./onnx/bert-base-cased-squad_opset11_o1_cpu.onnx\n",
- " apply: Fused LayerNormalization count: 25\n",
- " apply: Fused Gelu count: 12\n",
- " apply: Fused SkipLayerNormalization count: 25\n",
- " apply: Fused Attention count: 12\n",
+ " apply: Fused LayerNormalization count: 49\n",
+ " apply: Fused Gelu count: 24\n",
+ "adjust_reshape_and_expand: Removed Reshape and Expand count: 0\n",
+ " apply: Fused SkipLayerNormalization count: 48\n",
+ " apply: Fused Attention count: 24\n",
" prune_graph: Graph pruned: 0 inputs, 0 outputs and 5 nodes are removed\n",
" apply: Fused EmbedLayerNormalization(with mask) count: 1\n",
- " prune_graph: Graph pruned: 0 inputs, 0 outputs and 12 nodes are removed\n",
+ " prune_graph: Graph pruned: 0 inputs, 0 outputs and 3 nodes are removed\n",
" prune_graph: Graph pruned: 0 inputs, 0 outputs and 0 nodes are removed\n",
- " apply: Fused BiasGelu count: 12\n",
- " apply: Fused SkipLayerNormalization(add bias) count: 24\n",
+ " apply: Fused BiasGelu count: 24\n",
+ " apply: Fused SkipLayerNormalization(add bias) count: 48\n",
" optimize: opset verion: 11\n",
+ " save_model_to_file: Sort graphs in topological order\n",
" save_model_to_file: Output model to ./onnx/bert-base-cased-squad_opt_gpu_fp32.onnx\n",
- "get_fused_operator_statistics: Optimized operators:{'EmbedLayerNormalization': 1, 'Attention': 12, 'Gelu': 0, 'FastGelu': 0, 'BiasGelu': 12, 'LayerNormalization': 0, 'SkipLayerNormalization': 24}\n",
+ "get_fused_operator_statistics: Optimized operators:{'EmbedLayerNormalization': 1, 'Attention': 24, 'Gelu': 0, 'FastGelu': 0, 'BiasGelu': 24, 'LayerNormalization': 0, 'SkipLayerNormalization': 48}\n",
" main: The model has been fully optimized.\n"
]
}
@@ -659,7 +673,7 @@
"source": [
"optimized_fp32_model_path = './onnx/bert-base-cased-squad_opt_{}_fp32.onnx'.format('gpu' if use_gpu else 'cpu')\n",
"\n",
- "!python -m onnxruntime_tools.optimizer_cli --input $export_model_path --output $optimized_fp32_model_path"
+ "!{sys.executable} -m onnxruntime.transformers.optimizer --input $export_model_path --output $optimized_fp32_model_path"
]
},
{
@@ -712,32 +726,24 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=None, omp_wait_policy=None, intra_op_num_threads=None, seed=3, verbose=False, inclusive=False, extra_latency=True)\n",
+ "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=None, seed=3, verbose=False)\n",
"Generating 1000 samples for batch_size=1 sequence_length=128\n",
- "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=0,OMP_NUM_THREADS=,OMP_WAIT_POLICY=,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
- "Average latency = 4.92 ms, Throughput = 203.24 QPS\n",
- "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
- "Average latency = 4.90 ms, Throughput = 203.88 QPS\n",
- "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
- "Average latency = 5.07 ms, Throughput = 197.16 QPS\n",
- "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
- "Average latency = 4.82 ms, Throughput = 207.33 QPS\n",
- "skip duplicated test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
- "skip duplicated test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
- "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
- "Average latency = 4.93 ms, Throughput = 202.92 QPS\n",
- "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
- "Average latency = 4.91 ms, Throughput = 203.55 QPS\n",
- "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
- "Average latency = 4.88 ms, Throughput = 204.90 QPS\n",
- "Test summary is saved to onnx/perf_results_GPU_B1_S128_20200617-232134.txt\n"
+ "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=4,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n",
+ "Average latency = 23.72 ms, Throughput = 42.15 QPS\n",
+ "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n",
+ "Average latency = 24.24 ms, Throughput = 41.25 QPS\n",
+ "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=2,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n",
+ "Average latency = 24.36 ms, Throughput = 41.05 QPS\n",
+ "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n",
+ "Average latency = 24.39 ms, Throughput = 41.01 QPS\n",
+ "Test summary is saved to onnx/perf_results_GPU_B1_S128_20210714-001817.txt\n"
]
}
],
"source": [
"GPU_OPTION = '--use_gpu' if use_gpu else ''\n",
"\n",
- "!python -m onnxruntime_tools.transformers.bert_perf_test --model $optimized_fp32_model_path --batch_size 1 --sequence_length 128 --samples 1000 --test_times 1 --inclusive --all $GPU_OPTION"
+ "!{sys.executable} -m onnxruntime.transformers.bert_perf_test --model $optimized_fp32_model_path --batch_size 1 --sequence_length 128 --samples 1000 --test_times 1 $GPU_OPTION"
]
},
{
@@ -756,7 +762,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Float32 model perf results from ./onnx/perf_results_GPU_B1_S128_20200617-232134.txt\n"
+ "Float32 model perf results from ./onnx/perf_results_GPU_B1_S128_20210714-001817.txt\n"
]
},
{
@@ -788,117 +794,52 @@
" Latency_P99 | \n",
" Throughput(QPS) | \n",
" intra_op_num_threads | \n",
- " OMP_NUM_THREADS | \n",
- " OMP_WAIT_POLICY | \n",
- " contiguous | \n",
- " warmup | \n",
" \n",
" \n",
" \n",
" \n",
" | 0 | \n",
- " 4.82 | \n",
- " 4.53 | \n",
- " 4.57 | \n",
- " 5.15 | \n",
- " 7.25 | \n",
- " 8.75 | \n",
- " 207.33 | \n",
- " 1 | \n",
- " 12 | \n",
- " ACTIVE | \n",
- " None | \n",
- " True | \n",
+ " 23.72 | \n",
+ " 23.72 | \n",
+ " 23.87 | \n",
+ " 23.99 | \n",
+ " 24.11 | \n",
+ " 24.37 | \n",
+ " 42.15 | \n",
+ " 4 | \n",
"
\n",
" \n",
" | 1 | \n",
- " 4.88 | \n",
- " 4.54 | \n",
- " 4.58 | \n",
- " 6.47 | \n",
- " 7.13 | \n",
- " 8.68 | \n",
- " 204.90 | \n",
- " 12 | \n",
- " 12 | \n",
- " PASSIVE | \n",
- " None | \n",
- " True | \n",
+ " 24.24 | \n",
+ " 24.24 | \n",
+ " 24.42 | \n",
+ " 24.60 | \n",
+ " 24.76 | \n",
+ " 25.23 | \n",
+ " 41.25 | \n",
+ " 3 | \n",
"
\n",
" \n",
" | 2 | \n",
- " 4.90 | \n",
- " 4.54 | \n",
- " 4.57 | \n",
- " 6.16 | \n",
- " 7.64 | \n",
- " 8.82 | \n",
- " 203.88 | \n",
- " 1 | \n",
- " 12 | \n",
- " PASSIVE | \n",
- " None | \n",
- " True | \n",
+ " 24.36 | \n",
+ " 24.36 | \n",
+ " 24.47 | \n",
+ " 24.69 | \n",
+ " 25.01 | \n",
+ " 26.52 | \n",
+ " 41.05 | \n",
+ " 2 | \n",
"
\n",
" \n",
" | 3 | \n",
- " 4.91 | \n",
- " 4.55 | \n",
- " 4.59 | \n",
- " 6.70 | \n",
- " 7.43 | \n",
- " 8.78 | \n",
- " 203.55 | \n",
- " 12 | \n",
- " 12 | \n",
- " ACTIVE | \n",
- " None | \n",
- " True | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 4.92 | \n",
- " 4.57 | \n",
- " 4.60 | \n",
- " 6.50 | \n",
- " 7.82 | \n",
- " 8.90 | \n",
- " 203.24 | \n",
- " 0 | \n",
- " | \n",
- " | \n",
- " None | \n",
- " True | \n",
- "
\n",
- " \n",
- " | 5 | \n",
- " 4.93 | \n",
- " 4.55 | \n",
- " 4.59 | \n",
- " 6.66 | \n",
- " 7.57 | \n",
- " 8.80 | \n",
- " 202.92 | \n",
- " 12 | \n",
+ " 24.39 | \n",
+ " 24.37 | \n",
+ " 24.47 | \n",
+ " 24.65 | \n",
+ " 24.73 | \n",
+ " 25.12 | \n",
+ " 41.01 | \n",
" 1 | \n",
- " PASSIVE | \n",
- " None | \n",
- " True | \n",
- "
\n",
- " \n",
- " | 6 | \n",
- " 5.07 | \n",
- " 4.56 | \n",
- " 4.61 | \n",
- " 7.19 | \n",
- " 8.11 | \n",
- " 9.01 | \n",
- " 197.16 | \n",
- " 12 | \n",
- " 1 | \n",
- " ACTIVE | \n",
- " None | \n",
- " True | \n",
"
\n",
" \n",
"\n",
@@ -906,31 +847,16 @@
],
"text/plain": [
" Latency(ms) Latency_P50 Latency_P75 Latency_P90 Latency_P95 \\\n",
- "0 4.82 4.53 4.57 5.15 7.25 \n",
- "1 4.88 4.54 4.58 6.47 7.13 \n",
- "2 4.90 4.54 4.57 6.16 7.64 \n",
- "3 4.91 4.55 4.59 6.70 7.43 \n",
- "4 4.92 4.57 4.60 6.50 7.82 \n",
- "5 4.93 4.55 4.59 6.66 7.57 \n",
- "6 5.07 4.56 4.61 7.19 8.11 \n",
+ "0 23.72 23.72 23.87 23.99 24.11 \n",
+ "1 24.24 24.24 24.42 24.60 24.76 \n",
+ "2 24.36 24.36 24.47 24.69 25.01 \n",
+ "3 24.39 24.37 24.47 24.65 24.73 \n",
"\n",
- " Latency_P99 Throughput(QPS) intra_op_num_threads OMP_NUM_THREADS \\\n",
- "0 8.75 207.33 1 12 \n",
- "1 8.68 204.90 12 12 \n",
- "2 8.82 203.88 1 12 \n",
- "3 8.78 203.55 12 12 \n",
- "4 8.90 203.24 0 \n",
- "5 8.80 202.92 12 1 \n",
- "6 9.01 197.16 12 1 \n",
- "\n",
- " OMP_WAIT_POLICY contiguous warmup \n",
- "0 ACTIVE None True \n",
- "1 PASSIVE None True \n",
- "2 PASSIVE None True \n",
- "3 ACTIVE None True \n",
- "4 None True \n",
- "5 PASSIVE None True \n",
- "6 ACTIVE None True "
+ " Latency_P99 Throughput(QPS) intra_op_num_threads \n",
+ "0 24.37 42.15 4 \n",
+ "1 25.23 41.25 3 \n",
+ "2 26.52 41.05 2 \n",
+ "3 25.12 41.01 1 "
]
},
"execution_count": 18,
@@ -943,7 +869,7 @@
"import glob \n",
"import pandas\n",
"latest_result_file = max(glob.glob(\"./onnx/perf_results_GPU_B1_S128_*.txt\"), key=os.path.getmtime)\n",
- "result_data = pandas.read_table(latest_result_file, converters={'OMP_NUM_THREADS': str, 'OMP_WAIT_POLICY':str})\n",
+ "result_data = pandas.read_table(latest_result_file)\n",
"print(\"Float32 model perf results from\", latest_result_file)\n",
"# Remove some columns that have same values for all rows.\n",
"columns_to_remove = ['model', 'graph_optimization_level', 'batch_size', 'sequence_length', 'test_cases', 'test_times', 'use_gpu']\n",
@@ -974,13 +900,13 @@
"output_type": "stream",
"text": [
"100% passed for 100 random inputs given thresholds (rtol=0.01, atol=0.01).\r\n",
- "maximum absolute difference=1.9222497940063477e-06\r\n",
- "maximum relative difference=0.05027933046221733\r\n"
+ "maximum absolute difference=5.316734313964844e-05\r\n",
+ "maximum relative difference=0.00012461667938623577\r\n"
]
}
],
"source": [
- "!python -m onnxruntime_tools.transformers.compare_bert_results --baseline_model $export_model_path --optimized_model $optimized_fp32_model_path --batch_size 1 --sequence_length 128 --samples 100 --rtol 0.01 --atol 0.01 $GPU_OPTION"
+ "!{sys.executable} -m onnxruntime.transformers.compare_bert_results --baseline_model $export_model_path --optimized_model $optimized_fp32_model_path --batch_size 1 --sequence_length 128 --samples 100 --rtol 0.01 --atol 0.01 $GPU_OPTION"
]
},
{
@@ -1003,27 +929,28 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "optimize_by_onnxruntime: Save optimized model by onnxruntime to ./onnx/bert-base-cased-squad_opset11_o1_cpu.onnx\n",
- " apply: Fused LayerNormalization count: 25\n",
- " apply: Fused Gelu count: 12\n",
- " apply: Fused SkipLayerNormalization count: 25\n",
- " apply: Fused Attention count: 12\n",
+ " apply: Fused LayerNormalization count: 49\n",
+ " apply: Fused Gelu count: 24\n",
+ "adjust_reshape_and_expand: Removed Reshape and Expand count: 0\n",
+ " apply: Fused SkipLayerNormalization count: 48\n",
+ " apply: Fused Attention count: 24\n",
" prune_graph: Graph pruned: 0 inputs, 0 outputs and 5 nodes are removed\n",
" apply: Fused EmbedLayerNormalization(with mask) count: 1\n",
- " prune_graph: Graph pruned: 0 inputs, 0 outputs and 12 nodes are removed\n",
+ " prune_graph: Graph pruned: 0 inputs, 0 outputs and 3 nodes are removed\n",
" prune_graph: Graph pruned: 0 inputs, 0 outputs and 0 nodes are removed\n",
- " apply: Fused BiasGelu count: 12\n",
- " apply: Fused SkipLayerNormalization(add bias) count: 24\n",
+ " apply: Fused BiasGelu count: 24\n",
+ " apply: Fused SkipLayerNormalization(add bias) count: 48\n",
" optimize: opset verion: 11\n",
+ " save_model_to_file: Sort graphs in topological order\n",
" save_model_to_file: Output model to ./onnx/bert-base-cased-squad_opt_gpu_fp16.onnx\n",
- "get_fused_operator_statistics: Optimized operators:{'EmbedLayerNormalization': 1, 'Attention': 12, 'Gelu': 0, 'FastGelu': 0, 'BiasGelu': 12, 'LayerNormalization': 0, 'SkipLayerNormalization': 24}\n",
+ "get_fused_operator_statistics: Optimized operators:{'EmbedLayerNormalization': 1, 'Attention': 24, 'Gelu': 0, 'FastGelu': 0, 'BiasGelu': 24, 'LayerNormalization': 0, 'SkipLayerNormalization': 48}\n",
" main: The model has been fully optimized.\n"
]
}
],
"source": [
"optimized_fp16_model_path = './onnx/bert-base-cased-squad_opt_{}_fp16.onnx'.format('gpu' if use_gpu else 'cpu')\n",
- "!python -m onnxruntime_tools.optimizer_cli --input $export_model_path --output $optimized_fp16_model_path --float16"
+ "!{sys.executable} -m onnxruntime.transformers.optimizer --input $export_model_path --output $optimized_fp16_model_path --float16"
]
},
{
@@ -1035,31 +962,23 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=None, omp_wait_policy=None, intra_op_num_threads=None, seed=3, verbose=False, inclusive=False, extra_latency=True)\n",
+ "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=None, seed=3, verbose=False)\n",
"Generating 1000 samples for batch_size=1 sequence_length=128\n",
- "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=0,OMP_NUM_THREADS=,OMP_WAIT_POLICY=,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
- "Average latency = 3.01 ms, Throughput = 331.90 QPS\n",
- "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
- "Average latency = 3.12 ms, Throughput = 320.00 QPS\n",
- "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
- "Average latency = 3.02 ms, Throughput = 331.39 QPS\n",
- "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
- "Average latency = 3.01 ms, Throughput = 332.53 QPS\n",
- "skip duplicated test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
- "skip duplicated test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
- "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
- "Average latency = 3.04 ms, Throughput = 328.67 QPS\n",
- "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
- "Average latency = 3.01 ms, Throughput = 331.72 QPS\n",
- "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
- "Average latency = 3.04 ms, Throughput = 329.32 QPS\n",
- "Test summary is saved to onnx/perf_results_GPU_B1_S128_20200617-232234.txt\n"
+ "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=4,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n",
+ "Average latency = 6.78 ms, Throughput = 147.54 QPS\n",
+ "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n",
+ "Average latency = 6.76 ms, Throughput = 147.85 QPS\n",
+ "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=2,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n",
+ "Average latency = 6.79 ms, Throughput = 147.30 QPS\n",
+ "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n",
+ "Average latency = 6.81 ms, Throughput = 146.75 QPS\n",
+ "Test summary is saved to onnx/perf_results_GPU_B1_S128_20210714-002224.txt\n"
]
}
],
"source": [
"GPU_OPTION = '--use_gpu' if use_gpu else ''\n",
- "!python -m onnxruntime_tools.transformers.bert_perf_test --model $optimized_fp16_model_path --batch_size 1 --sequence_length 128 --samples 1000 --test_times 1 --inclusive --all $GPU_OPTION"
+ "!{sys.executable} -m onnxruntime.transformers.bert_perf_test --model $optimized_fp16_model_path --batch_size 1 --sequence_length 128 --samples 1000 --test_times 1 $GPU_OPTION"
]
},
{
@@ -1071,7 +990,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Float32 model perf results from ./onnx/perf_results_GPU_B1_S128_20200617-232234.txt\n"
+ "Float32 model perf results from ./onnx/perf_results_GPU_B1_S128_20210714-002224.txt\n"
]
},
{
@@ -1103,117 +1022,52 @@
" Latency_P99 | \n",
" Throughput(QPS) | \n",
" intra_op_num_threads | \n",
- " OMP_NUM_THREADS | \n",
- " OMP_WAIT_POLICY | \n",
- " contiguous | \n",
- " warmup | \n",
" \n",
" \n",
" \n",
" \n",
" | 0 | \n",
- " 3.01 | \n",
- " 2.79 | \n",
- " 2.81 | \n",
- " 2.86 | \n",
- " 5.08 | \n",
- " 7.16 | \n",
- " 332.53 | \n",
- " 1 | \n",
- " 12 | \n",
- " ACTIVE | \n",
- " None | \n",
- " True | \n",
+ " 6.76 | \n",
+ " 6.79 | \n",
+ " 6.81 | \n",
+ " 6.90 | \n",
+ " 6.91 | \n",
+ " 7.00 | \n",
+ " 147.85 | \n",
+ " 3 | \n",
"
\n",
" \n",
" | 1 | \n",
- " 3.01 | \n",
- " 2.80 | \n",
- " 2.81 | \n",
- " 2.88 | \n",
- " 4.52 | \n",
- " 7.05 | \n",
- " 331.90 | \n",
- " 0 | \n",
- " | \n",
- " | \n",
- " None | \n",
- " True | \n",
+ " 6.78 | \n",
+ " 6.70 | \n",
+ " 6.79 | \n",
+ " 6.87 | \n",
+ " 6.90 | \n",
+ " 7.63 | \n",
+ " 147.54 | \n",
+ " 4 | \n",
"
\n",
" \n",
" | 2 | \n",
- " 3.01 | \n",
- " 2.78 | \n",
- " 2.80 | \n",
- " 2.92 | \n",
- " 5.01 | \n",
- " 7.02 | \n",
- " 331.72 | \n",
- " 12 | \n",
- " 12 | \n",
- " ACTIVE | \n",
- " None | \n",
- " True | \n",
+ " 6.79 | \n",
+ " 6.79 | \n",
+ " 6.81 | \n",
+ " 6.89 | \n",
+ " 6.91 | \n",
+ " 7.19 | \n",
+ " 147.30 | \n",
+ " 2 | \n",
"
\n",
" \n",
" | 3 | \n",
- " 3.02 | \n",
- " 2.79 | \n",
- " 2.80 | \n",
- " 2.85 | \n",
- " 6.34 | \n",
- " 7.04 | \n",
- " 331.39 | \n",
- " 12 | \n",
- " 1 | \n",
- " ACTIVE | \n",
- " None | \n",
- " True | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 3.04 | \n",
- " 2.80 | \n",
- " 2.82 | \n",
- " 2.93 | \n",
- " 5.56 | \n",
- " 7.08 | \n",
- " 329.32 | \n",
- " 12 | \n",
- " 12 | \n",
- " PASSIVE | \n",
- " None | \n",
- " True | \n",
- "
\n",
- " \n",
- " | 5 | \n",
- " 3.04 | \n",
- " 2.79 | \n",
- " 2.81 | \n",
- " 2.92 | \n",
- " 6.37 | \n",
- " 7.08 | \n",
- " 328.67 | \n",
- " 12 | \n",
- " 1 | \n",
- " PASSIVE | \n",
- " None | \n",
- " True | \n",
- "
\n",
- " \n",
- " | 6 | \n",
- " 3.12 | \n",
- " 2.79 | \n",
- " 2.82 | \n",
- " 2.96 | \n",
- " 6.66 | \n",
+ " 6.81 | \n",
+ " 6.80 | \n",
+ " 6.89 | \n",
+ " 6.91 | \n",
+ " 6.97 | \n",
" 7.20 | \n",
- " 320.00 | \n",
+ " 146.75 | \n",
" 1 | \n",
- " 12 | \n",
- " PASSIVE | \n",
- " None | \n",
- " True | \n",
"
\n",
" \n",
"\n",
@@ -1221,31 +1075,16 @@
],
"text/plain": [
" Latency(ms) Latency_P50 Latency_P75 Latency_P90 Latency_P95 \\\n",
- "0 3.01 2.79 2.81 2.86 5.08 \n",
- "1 3.01 2.80 2.81 2.88 4.52 \n",
- "2 3.01 2.78 2.80 2.92 5.01 \n",
- "3 3.02 2.79 2.80 2.85 6.34 \n",
- "4 3.04 2.80 2.82 2.93 5.56 \n",
- "5 3.04 2.79 2.81 2.92 6.37 \n",
- "6 3.12 2.79 2.82 2.96 6.66 \n",
+ "0 6.76 6.79 6.81 6.90 6.91 \n",
+ "1 6.78 6.70 6.79 6.87 6.90 \n",
+ "2 6.79 6.79 6.81 6.89 6.91 \n",
+ "3 6.81 6.80 6.89 6.91 6.97 \n",
"\n",
- " Latency_P99 Throughput(QPS) intra_op_num_threads OMP_NUM_THREADS \\\n",
- "0 7.16 332.53 1 12 \n",
- "1 7.05 331.90 0 \n",
- "2 7.02 331.72 12 12 \n",
- "3 7.04 331.39 12 1 \n",
- "4 7.08 329.32 12 12 \n",
- "5 7.08 328.67 12 1 \n",
- "6 7.20 320.00 1 12 \n",
- "\n",
- " OMP_WAIT_POLICY contiguous warmup \n",
- "0 ACTIVE None True \n",
- "1 None True \n",
- "2 ACTIVE None True \n",
- "3 ACTIVE None True \n",
- "4 PASSIVE None True \n",
- "5 PASSIVE None True \n",
- "6 PASSIVE None True "
+ " Latency_P99 Throughput(QPS) intra_op_num_threads \n",
+ "0 7.00 147.85 3 \n",
+ "1 7.63 147.54 4 \n",
+ "2 7.19 147.30 2 \n",
+ "3 7.20 146.75 1 "
]
},
"execution_count": 22,
@@ -1258,7 +1097,7 @@
"import glob \n",
"import pandas\n",
"latest_result_file = max(glob.glob(\"./onnx/perf_results_GPU_B1_S128_*.txt\"), key=os.path.getmtime)\n",
- "result_data = pandas.read_table(latest_result_file, converters={'OMP_NUM_THREADS': str, 'OMP_WAIT_POLICY':str})\n",
+ "result_data = pandas.read_table(latest_result_file)\n",
"print(\"Float32 model perf results from\", latest_result_file)\n",
"# Remove some columns that have same values for all rows.\n",
"columns_to_remove = ['model', 'graph_optimization_level', 'batch_size', 'sequence_length', 'test_cases', 'test_times', 'use_gpu']\n",
@@ -1286,47 +1125,43 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "test setting TestSetting(batch_size=32, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n",
+ "test setting TestSetting(batch_size=32, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=3, seed=3, verbose=False)\n",
"Generating 1000 samples for batch_size=32 sequence_length=128\n",
- "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=32,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
- "Average latency = 16.17 ms, Throughput = 1979.41 QPS\n",
- "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n",
+ "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=32,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n",
+ "Average latency = 168.40 ms, Throughput = 190.02 QPS\n",
+ "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=3, seed=3, verbose=False)\n",
"Generating 1000 samples for batch_size=1 sequence_length=128\n",
- "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
- "Average latency = 3.00 ms, Throughput = 333.83 QPS\n",
- "test setting TestSetting(batch_size=2, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n",
+ "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n",
+ "Average latency = 7.14 ms, Throughput = 140.00 QPS\n",
+ "test setting TestSetting(batch_size=2, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=3, seed=3, verbose=False)\n",
"Generating 1000 samples for batch_size=2 sequence_length=128\n",
- "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=2,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
- "Average latency = 3.59 ms, Throughput = 557.32 QPS\n",
- "test setting TestSetting(batch_size=64, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n",
- "Generating 1000 samples for batch_size=64 sequence_length=128\n",
- "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=64,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
- "Average latency = 29.26 ms, Throughput = 2187.15 QPS\n",
- "test setting TestSetting(batch_size=4, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n",
+ "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=2,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n",
+ "Average latency = 11.27 ms, Throughput = 177.41 QPS\n",
+ "test setting TestSetting(batch_size=4, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=3, seed=3, verbose=False)\n",
"Generating 1000 samples for batch_size=4 sequence_length=128\n",
- "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=4,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
- "Average latency = 4.32 ms, Throughput = 926.92 QPS\n",
- "test setting TestSetting(batch_size=8, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n",
+ "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=4,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n",
+ "Average latency = 21.15 ms, Throughput = 189.09 QPS\n",
+ "test setting TestSetting(batch_size=8, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=3, seed=3, verbose=False)\n",
"Generating 1000 samples for batch_size=8 sequence_length=128\n",
- "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=8,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
- "Average latency = 6.32 ms, Throughput = 1266.63 QPS\n",
- "test setting TestSetting(batch_size=16, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n",
+ "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=8,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n",
+ "Average latency = 42.27 ms, Throughput = 189.27 QPS\n",
+ "test setting TestSetting(batch_size=16, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=3, seed=3, verbose=False)\n",
"Generating 1000 samples for batch_size=16 sequence_length=128\n",
- "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=16,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
- "Average latency = 9.60 ms, Throughput = 1666.05 QPS\n",
- "Test summary is saved to onnx/perf_results_GPU_B1-2-4-8-16-32-64_S128_20200617-232401.txt\n"
+ "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=16,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n",
+ "Average latency = 83.77 ms, Throughput = 191.01 QPS\n",
+ "Test summary is saved to onnx/perf_results_GPU_B1-2-4-8-16-32_S128_20210714-002816.txt\n"
]
}
],
"source": [
"GPU_OPTION = '--use_gpu' if use_gpu else ''\n",
- "THREAD_SETTING = '--intra_op_num_threads 1 --omp_num_threads {} --omp_wait_policy ACTIVE'.format(psutil.cpu_count(logical=True))\n",
- "!python -m onnxruntime_tools.transformers.bert_perf_test --model $optimized_fp16_model_path --batch_size 1 2 4 8 16 32 64 --sequence_length 128 --samples 1000 --test_times 1 --inclusive $THREAD_SETTING $GPU_OPTION"
+ "THREAD_SETTING = '--intra_op_num_threads 3'\n",
+ "!{sys.executable} -m onnxruntime.transformers.bert_perf_test --model $optimized_fp16_model_path --batch_size 1 2 4 8 16 32 --sequence_length 128 --samples 1000 --test_times 1 $THREAD_SETTING $GPU_OPTION"
]
},
{
"cell_type": "code",
- "execution_count": 26,
+ "execution_count": 24,
"metadata": {
"scrolled": false
},
@@ -1335,7 +1170,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Float16 model summary from ./onnx/perf_results_GPU_B1-2-4-8-16-32-64_S128_20200617-232401.txt\n"
+ "Float16 model summary from ./onnx/perf_results_GPU_B1-2-4-8-16-32_S128_20210714-002816.txt\n"
]
},
{
@@ -1372,106 +1207,93 @@
" \n",
" \n",
" | 0 | \n",
- " 3.00 | \n",
- " 2.79 | \n",
- " 2.81 | \n",
- " 2.86 | \n",
- " 4.37 | \n",
- " 7.08 | \n",
- " 333.83 | \n",
+ " 7.14 | \n",
+ " 7.10 | \n",
+ " 7.13 | \n",
+ " 7.25 | \n",
+ " 7.35 | \n",
+ " 10.99 | \n",
+ " 140.00 | \n",
" 1 | \n",
"
\n",
" \n",
" | 1 | \n",
- " 3.59 | \n",
- " 3.33 | \n",
- " 3.35 | \n",
- " 3.42 | \n",
- " 6.60 | \n",
- " 7.54 | \n",
- " 557.32 | \n",
+ " 11.27 | \n",
+ " 11.23 | \n",
+ " 11.28 | \n",
+ " 11.53 | \n",
+ " 11.57 | \n",
+ " 12.05 | \n",
+ " 177.41 | \n",
" 2 | \n",
"
\n",
" \n",
" | 2 | \n",
- " 4.32 | \n",
- " 3.98 | \n",
- " 4.01 | \n",
- " 4.64 | \n",
- " 7.23 | \n",
- " 8.11 | \n",
- " 926.92 | \n",
+ " 21.15 | \n",
+ " 21.13 | \n",
+ " 21.25 | \n",
+ " 21.44 | \n",
+ " 21.59 | \n",
+ " 22.07 | \n",
+ " 189.09 | \n",
" 4 | \n",
"
\n",
" \n",
" | 3 | \n",
- " 6.32 | \n",
- " 5.94 | \n",
- " 5.97 | \n",
- " 7.61 | \n",
- " 8.96 | \n",
- " 10.12 | \n",
- " 1266.63 | \n",
+ " 42.27 | \n",
+ " 42.26 | \n",
+ " 42.68 | \n",
+ " 42.95 | \n",
+ " 43.11 | \n",
+ " 45.11 | \n",
+ " 189.27 | \n",
" 8 | \n",
"
\n",
" \n",
" | 4 | \n",
- " 9.60 | \n",
- " 9.22 | \n",
- " 9.25 | \n",
- " 11.32 | \n",
- " 12.33 | \n",
- " 13.34 | \n",
- " 1666.05 | \n",
+ " 83.77 | \n",
+ " 83.84 | \n",
+ " 84.29 | \n",
+ " 84.94 | \n",
+ " 85.35 | \n",
+ " 86.34 | \n",
+ " 191.01 | \n",
" 16 | \n",
"
\n",
" \n",
" | 5 | \n",
- " 16.17 | \n",
- " 15.80 | \n",
- " 15.90 | \n",
- " 17.38 | \n",
- " 18.80 | \n",
- " 19.93 | \n",
- " 1979.41 | \n",
+ " 168.40 | \n",
+ " 169.62 | \n",
+ " 170.78 | \n",
+ " 171.94 | \n",
+ " 172.82 | \n",
+ " 174.28 | \n",
+ " 190.02 | \n",
" 32 | \n",
"
\n",
- " \n",
- " | 6 | \n",
- " 29.26 | \n",
- " 28.89 | \n",
- " 29.01 | \n",
- " 30.63 | \n",
- " 32.53 | \n",
- " 33.28 | \n",
- " 2187.15 | \n",
- " 64 | \n",
- "
\n",
" \n",
"\n",
""
],
"text/plain": [
" Latency(ms) Latency_P50 Latency_P75 Latency_P90 Latency_P95 \\\n",
- "0 3.00 2.79 2.81 2.86 4.37 \n",
- "1 3.59 3.33 3.35 3.42 6.60 \n",
- "2 4.32 3.98 4.01 4.64 7.23 \n",
- "3 6.32 5.94 5.97 7.61 8.96 \n",
- "4 9.60 9.22 9.25 11.32 12.33 \n",
- "5 16.17 15.80 15.90 17.38 18.80 \n",
- "6 29.26 28.89 29.01 30.63 32.53 \n",
+ "0 7.14 7.10 7.13 7.25 7.35 \n",
+ "1 11.27 11.23 11.28 11.53 11.57 \n",
+ "2 21.15 21.13 21.25 21.44 21.59 \n",
+ "3 42.27 42.26 42.68 42.95 43.11 \n",
+ "4 83.77 83.84 84.29 84.94 85.35 \n",
+ "5 168.40 169.62 170.78 171.94 172.82 \n",
"\n",
" Latency_P99 Throughput(QPS) batch_size \n",
- "0 7.08 333.83 1 \n",
- "1 7.54 557.32 2 \n",
- "2 8.11 926.92 4 \n",
- "3 10.12 1266.63 8 \n",
- "4 13.34 1666.05 16 \n",
- "5 19.93 1979.41 32 \n",
- "6 33.28 2187.15 64 "
+ "0 10.99 140.00 1 \n",
+ "1 12.05 177.41 2 \n",
+ "2 22.07 189.09 4 \n",
+ "3 45.11 189.27 8 \n",
+ "4 86.34 191.01 16 \n",
+ "5 174.28 190.02 32 "
]
},
- "execution_count": 26,
+ "execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
@@ -1481,10 +1303,10 @@
"import glob \n",
"import pandas\n",
"latest_result_file = max(glob.glob(\"./onnx/perf_results_*.txt\"), key=os.path.getmtime)\n",
- "result_data = pandas.read_table(latest_result_file, converters={'OMP_NUM_THREADS': str, 'OMP_WAIT_POLICY':str})\n",
+ "result_data = pandas.read_table(latest_result_file)\n",
"print(\"Float16 model summary from\", latest_result_file)\n",
- "columns_to_remove = ['model', 'graph_optimization_level', 'test_cases', 'test_times', 'use_gpu', 'warmup', 'sequence_length']\n",
- "columns_to_remove.extend(['intra_op_num_threads', 'OMP_NUM_THREADS', 'OMP_WAIT_POLICY', 'contiguous'])\n",
+ "columns_to_remove = ['model', 'graph_optimization_level', 'test_cases', 'test_times', 'use_gpu', 'sequence_length']\n",
+ "columns_to_remove.extend(['intra_op_num_threads'])\n",
"result_data.drop(columns_to_remove, axis=1, inplace=True)\n",
"result_data"
]
@@ -1506,7 +1328,7 @@
},
{
"cell_type": "code",
- "execution_count": 27,
+ "execution_count": 25,
"metadata": {
"scrolled": true
},
@@ -1517,42 +1339,126 @@
"text": [
"{\r\n",
" \"gpu\": {\r\n",
- " \"driver_version\": \"440.64.00\",\r\n",
+ " \"driver_version\": \"450.51.05\",\r\n",
" \"devices\": [\r\n",
" {\r\n",
- " \"memory_total\": 16945512448,\r\n",
- " \"memory_available\": 14110883840,\r\n",
- " \"name\": \"Tesla V100-PCIE-16GB\"\r\n",
- " },\r\n",
- " {\r\n",
- " \"memory_total\": 16945512448,\r\n",
- " \"memory_available\": 16932601856,\r\n",
- " \"name\": \"Tesla V100-PCIE-16GB\"\r\n",
+ " \"memory_total\": 15843721216,\r\n",
+ " \"memory_available\": 9313189888,\r\n",
+ " \"name\": \"Tesla T4\"\r\n",
" }\r\n",
" ]\r\n",
" },\r\n",
" \"cpu\": {\r\n",
- " \"brand\": \"Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz\",\r\n",
- " \"cores\": 12,\r\n",
- " \"logical_cores\": 12,\r\n",
- " \"hz\": \"2.5940 GHz\",\r\n",
- " \"l2_cache\": \"256 KB\",\r\n",
- " \"l3_cache\": \"35840 KB\",\r\n",
+ " \"brand\": \"AMD EPYC 7V12 64-Core Processor\",\r\n",
+ " \"cores\": 4,\r\n",
+ " \"logical_cores\": 4,\r\n",
+ " \"hz\": [\r\n",
+ " 2445417000,\r\n",
+ " 0\r\n",
+ " ],\r\n",
+ " \"l2_cache\": 524288,\r\n",
+ " \"flags\": [\r\n",
+ " \"3dnowext\",\r\n",
+ " \"3dnowprefetch\",\r\n",
+ " \"abm\",\r\n",
+ " \"adx\",\r\n",
+ " \"aes\",\r\n",
+ " \"apic\",\r\n",
+ " \"arat\",\r\n",
+ " \"avx\",\r\n",
+ " \"avx2\",\r\n",
+ " \"bmi1\",\r\n",
+ " \"bmi2\",\r\n",
+ " \"clflush\",\r\n",
+ " \"clflushopt\",\r\n",
+ " \"clwb\",\r\n",
+ " \"cmov\",\r\n",
+ " \"cmp_legacy\",\r\n",
+ " \"cpuid\",\r\n",
+ " \"cr8_legacy\",\r\n",
+ " \"cx16\",\r\n",
+ " \"cx8\",\r\n",
+ " \"de\",\r\n",
+ " \"extd_apicid\",\r\n",
+ " \"f16c\",\r\n",
+ " \"fma\",\r\n",
+ " \"fpu\",\r\n",
+ " \"fsgsbase\",\r\n",
+ " \"fxsr\",\r\n",
+ " \"fxsr_opt\",\r\n",
+ " \"ht\",\r\n",
+ " \"hypervisor\",\r\n",
+ " \"lahf_lm\",\r\n",
+ " \"lm\",\r\n",
+ " \"mca\",\r\n",
+ " \"mce\",\r\n",
+ " \"misalignsse\",\r\n",
+ " \"mmx\",\r\n",
+ " \"mmxext\",\r\n",
+ " \"movbe\",\r\n",
+ " \"msr\",\r\n",
+ " \"mtrr\",\r\n",
+ " \"nopl\",\r\n",
+ " \"nx\",\r\n",
+ " \"osvw\",\r\n",
+ " \"osxsave\",\r\n",
+ " \"pae\",\r\n",
+ " \"pat\",\r\n",
+ " \"pclmulqdq\",\r\n",
+ " \"pdpe1gb\",\r\n",
+ " \"pge\",\r\n",
+ " \"pni\",\r\n",
+ " \"popcnt\",\r\n",
+ " \"pse\",\r\n",
+ " \"pse36\",\r\n",
+ " \"rdpid\",\r\n",
+ " \"rdrand\",\r\n",
+ " \"rdrnd\",\r\n",
+ " \"rdseed\",\r\n",
+ " \"rdtscp\",\r\n",
+ " \"rep_good\",\r\n",
+ " \"sep\",\r\n",
+ " \"sha\",\r\n",
+ " \"sha_ni\",\r\n",
+ " \"smap\",\r\n",
+ " \"smep\",\r\n",
+ " \"ssbd\",\r\n",
+ " \"sse\",\r\n",
+ " \"sse2\",\r\n",
+ " \"sse4_1\",\r\n",
+ " \"sse4_2\",\r\n",
+ " \"sse4a\",\r\n",
+ " \"ssse3\",\r\n",
+ " \"syscall\",\r\n",
+ " \"topoext\",\r\n",
+ " \"tsc\",\r\n",
+ " \"umip\",\r\n",
+ " \"vme\",\r\n",
+ " \"vmmcall\",\r\n",
+ " \"xgetbv1\",\r\n",
+ " \"xsave\",\r\n",
+ " \"xsavec\",\r\n",
+ " \"xsaveerptr\",\r\n",
+ " \"xsaveopt\",\r\n",
+ " \"xsaves\"\r\n",
+ " ],\r\n",
" \"processor\": \"x86_64\"\r\n",
" },\r\n",
" \"memory\": {\r\n",
- " \"total\": 236645588992,\r\n",
- " \"available\": 222567559168\r\n",
+ " \"total\": 29450223616,\r\n",
+ " \"available\": 22402334720\r\n",
" },\r\n",
- " \"python\": \"3.7.7.final.0 (64 bit)\",\r\n",
- " \"os\": \"Linux-4.15.0-1089-azure-x86_64-with-debian-stretch-sid\",\r\n",
+ " \"python\": \"3.6.13.final.0 (64 bit)\",\r\n",
+ " \"os\": \"Linux-5.4.0-1046-azure-x86_64-with-debian-buster-sid\",\r\n",
" \"onnxruntime\": {\r\n",
- " \"version\": \"1.3.0\",\r\n",
+ " \"version\": \"1.8.1\",\r\n",
" \"support_gpu\": true\r\n",
" },\r\n",
+ " \"onnxruntime_tools\": null,\r\n",
" \"pytorch\": {\r\n",
- " \"version\": \"1.5.0\",\r\n",
- " \"support_gpu\": true\r\n",
+ " \"version\": \"1.9.0+cu111\",\r\n",
+ " \"support_gpu\": true,\r\n",
+ " \"cuda\": \"11.1\"\r\n",
" },\r\n",
" \"tensorflow\": null\r\n",
"}\r\n"
@@ -1560,15 +1466,15 @@
}
],
"source": [
- "!{sys.executable} -m onnxruntime_tools.transformers.machine_info --silent"
+ "!{sys.executable} -m onnxruntime.transformers.machine_info --silent"
]
}
],
"metadata": {
"kernelspec": {
- "display_name": "gpu_env_py37",
+ "display_name": "gpu_env",
"language": "python",
- "name": "gpu_env_py37"
+ "name": "gpu_env"
},
"language_info": {
"codemirror_mode": {
@@ -1580,7 +1486,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.7.7"
+ "version": "3.6.13"
}
},
"nbformat": 4,