update bert notebook to use onnxruntime 1.8.1 (#8379)

This commit is contained in:
Tianlei Wu 2021-07-19 14:16:59 -07:00 committed by GitHub
parent afce0e2543
commit dfe42e185c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 685 additions and 814 deletions

View file

@ -47,76 +47,23 @@
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Looking in links: https://download.pytorch.org/whl/torch_stable.html\n",
"Requirement already up-to-date: torch==1.6.0+cpu in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (1.6.0+cpu)\n",
"Requirement already up-to-date: torchvision==0.7.0+cpu in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (0.7.0+cpu)\n",
"Requirement already satisfied, skipping upgrade: numpy in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from torch==1.6.0+cpu) (1.18.1)\n",
"Requirement already satisfied, skipping upgrade: future in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from torch==1.6.0+cpu) (0.18.2)\n",
"Requirement already satisfied, skipping upgrade: pillow>=4.1.1 in c:\\users\\tianl\\appdata\\roaming\\python\\python36\\site-packages (from torchvision==0.7.0+cpu) (7.0.0)\n",
"Requirement already up-to-date: onnxruntime==1.4.0 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (1.4.0)\n",
"Requirement already satisfied, skipping upgrade: numpy>=1.16.6 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime==1.4.0) (1.18.1)\n",
"Requirement already satisfied, skipping upgrade: protobuf in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime==1.4.0) (3.11.3)\n",
"Requirement already satisfied, skipping upgrade: six>=1.9 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from protobuf->onnxruntime==1.4.0) (1.14.0)\n",
"Requirement already satisfied, skipping upgrade: setuptools in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from protobuf->onnxruntime==1.4.0) (45.2.0.post20200210)\n",
"Requirement already up-to-date: onnxruntime-tools in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (1.4.1)\n",
"Requirement already satisfied, skipping upgrade: py-cpuinfo in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (5.0.0)\n",
"Requirement already satisfied, skipping upgrade: packaging in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (20.1)\n",
"Requirement already satisfied, skipping upgrade: coloredlogs in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (14.0)\n",
"Requirement already satisfied, skipping upgrade: py3nvml in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (0.2.5)\n",
"Requirement already satisfied, skipping upgrade: numpy in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (1.18.1)\n",
"Requirement already satisfied, skipping upgrade: psutil in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (5.7.0)\n",
"Requirement already satisfied, skipping upgrade: onnx in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (1.7.0)\n",
"Requirement already satisfied, skipping upgrade: pyparsing>=2.0.2 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from packaging->onnxruntime-tools) (2.4.6)\n",
"Requirement already satisfied, skipping upgrade: six in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from packaging->onnxruntime-tools) (1.14.0)\n",
"Requirement already satisfied, skipping upgrade: humanfriendly>=7.1 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from coloredlogs->onnxruntime-tools) (8.1)\n",
"Requirement already satisfied, skipping upgrade: xmltodict in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from py3nvml->onnxruntime-tools) (0.12.0)\n",
"Requirement already satisfied, skipping upgrade: protobuf in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnx->onnxruntime-tools) (3.11.3)\n",
"Requirement already satisfied, skipping upgrade: typing-extensions>=3.6.2.1 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnx->onnxruntime-tools) (3.7.4.1)\n",
"Requirement already satisfied, skipping upgrade: pyreadline; sys_platform == \"win32\" in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from humanfriendly>=7.1->coloredlogs->onnxruntime-tools) (2.1)\n",
"Requirement already satisfied, skipping upgrade: setuptools in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from protobuf->onnx->onnxruntime-tools) (45.2.0.post20200210)\n",
"Requirement already satisfied: transformers==3.0.2 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (3.0.2)\n",
"Requirement already satisfied: packaging in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (20.1)\n",
"Requirement already satisfied: sentencepiece!=0.1.92 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (0.1.85)\n",
"Requirement already satisfied: sacremoses in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (0.0.38)\n",
"Requirement already satisfied: dataclasses; python_version < \"3.7\" in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (0.7)\n",
"Requirement already satisfied: requests in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (2.23.0)\n",
"Requirement already satisfied: tokenizers==0.8.1.rc1 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (0.8.1rc1)\n",
"Requirement already satisfied: tqdm>=4.27 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (4.43.0)\n",
"Requirement already satisfied: regex!=2019.12.17 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (2020.2.20)\n",
"Requirement already satisfied: filelock in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (3.0.12)\n",
"Requirement already satisfied: numpy in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (1.18.1)\n",
"Requirement already satisfied: pyparsing>=2.0.2 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from packaging->transformers==3.0.2) (2.4.6)\n",
"Requirement already satisfied: six in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from packaging->transformers==3.0.2) (1.14.0)\n",
"Requirement already satisfied: click in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from sacremoses->transformers==3.0.2) (7.0)\n",
"Requirement already satisfied: joblib in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from sacremoses->transformers==3.0.2) (0.14.1)\n",
"Requirement already satisfied: certifi>=2017.4.17 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from requests->transformers==3.0.2) (2020.4.5.1)\n",
"Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from requests->transformers==3.0.2) (1.25.8)\n",
"Requirement already satisfied: idna<3,>=2.5 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from requests->transformers==3.0.2) (2.9)\n",
"Requirement already satisfied: chardet<4,>=3.0.2 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from requests->transformers==3.0.2) (3.0.4)\n",
"Requirement already satisfied: wget in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (3.2)\n",
"Requirement already satisfied: netron in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (3.9.6)\n"
]
}
],
"outputs": [],
"source": [
"# Install PyTorch 1.6.0 and OnnxRuntime 1.4.0 for CPU-only.\n",
"import sys\n",
"if sys.platform == 'darwin': # Mac\n",
" !{sys.executable} -m pip install --upgrade torch torchvision\n",
"else:\n",
" !{sys.executable} -m pip install --upgrade torch==1.6.0+cpu torchvision==0.7.0+cpu -f https://download.pytorch.org/whl/torch_stable.html\n",
"!{sys.executable} -m pip install --upgrade onnxruntime==1.4.0\n",
"!{sys.executable} -m pip install --upgrade onnxconverter_common\n",
"!{sys.executable} -m pip install --upgrade onnxruntime-tools\n",
"\n",
"# Install other packages used in this notebook.\n",
"!{sys.executable} -m pip install transformers==3.0.2\n",
"!{sys.executable} -m pip install wget netron"
"run_install = False # Only need install once\n",
"if run_install:\n",
"    # Install the CPU-only build of PyTorch 1.9.0.\n",
"    # The '+cpu' wheels listed on torch_stable.html are Linux/Windows builds;\n",
"    # on macOS the regular wheels are already CPU-only, so install the same versions without the suffix.\n",
"    if sys.platform in ['linux', 'win32']: # Linux or Windows\n",
"        !{sys.executable} -m pip install torch==1.9.0+cpu torchvision==0.10.0+cpu torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html\n",
"    else: # Mac\n",
"        !{sys.executable} -m pip install torch==1.9.0 torchvision==0.10.0 torchaudio==0.9.0\n",
"\n",
"    !{sys.executable} -m pip install onnxruntime==1.8.1 onnx==1.9.0 onnxconverter_common==1.8.1\n",
"\n",
"    # Install other packages used in this notebook.\n",
"    !{sys.executable} -m pip install transformers==4.8.2\n",
"    !{sys.executable} -m pip install psutil pytz pandas py-cpuinfo py3nvml\n",
"    !{sys.executable} -m pip install wget netron"
]
},
{
@ -196,14 +143,14 @@
"name": "stderr",
"output_type": "stream",
"text": [
"Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForQuestionAnswering: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']\n",
"- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).\n",
"Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForQuestionAnswering: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']\n",
"- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
"100%|██████████████████████████████████████████████████████████████████████████████████| 48/48 [00:03<00:00, 12.87it/s]\n",
"convert squad examples to features: 100%|███████████████████████████████████████████| 100/100 [00:00<00:00, 131.41it/s]\n",
"add example index and unique id: 100%|████████████████████████████████████████████| 100/100 [00:00<00:00, 96776.74it/s]\n"
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:03<00:00, 12.15it/s]\n",
"convert squad examples to features: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 135.87it/s]\n",
"add example index and unique id: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 100031.10it/s]\n"
]
}
],
@ -252,6 +199,14 @@
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\git\\transformers\\src\\transformers\\modeling_utils.py:2074: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
" input_tensor.shape[chunk_dim] == tensor_shape for input_tensor in input_tensors\n"
]
},
{
"name": "stdout",
"output_type": "stream",
@ -319,7 +274,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"PyTorch cpu Inference time = 144.73 ms\n"
"PyTorch cpu Inference time = 119.80 ms\n"
]
}
],
@ -348,45 +303,26 @@
"source": [
"## 4. Inference ONNX Model with ONNX Runtime ##\n",
"\n",
"### OpenMP Environment Variable\n",
"\n",
"OpenMP environment variables are very important for CPU inference of Bert model. It has large performance impact on Bert model so you might need set it carefully according to [Performance Test Tool](#Performance-Test-Tool) result in later part of this notebook.\n",
"\n",
"Setting environment variables shall be done before importing onnxruntime. Otherwise, they might not take effect."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"import psutil\n",
"\n",
"# You may change the settings in this cell according to Performance Test Tool result.\n",
"os.environ[\"OMP_NUM_THREADS\"] = str(psutil.cpu_count(logical=True))\n",
"os.environ[\"OMP_WAIT_POLICY\"] = 'ACTIVE'"
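"For ONNX Runtime 1.6.0 or older, OpenMP environment variables are very important for CPU inference of the BERT model. Since 1.7.0, the official package is no longer built with OpenMP, so use the session option `intra_op_num_threads` to control the number of threads instead."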
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now we are ready to inference the model with ONNX Runtime. Here we can see that OnnxRuntime has better performance than PyTorch. \n",
"\n",
"It is better to use standalone python script like [Performance Test tool](#Performance-Test-tool) to get accurate performance results."
"Now we run inference on the model with ONNX Runtime. Here we can see that ONNX Runtime has better performance than PyTorch."
]
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"OnnxRuntime cpu Inference time = 88.55 ms\n"
"OnnxRuntime cpu Inference time = 72.46 ms\n"
]
}
],
@ -394,19 +330,15 @@
"import onnxruntime\n",
"import numpy\n",
"\n",
"# Print warning if user uses onnxruntime-gpu instead of onnxruntime package.\n",
"if 'CUDAExecutionProvider' in onnxruntime.get_available_providers():\n",
" print(\"warning: onnxruntime-gpu is not built with OpenMP. You might try onnxruntime package to test CPU inference.\")\n",
"\n",
"sess_options = onnxruntime.SessionOptions()\n",
"\n",
"# Optional: store the optimized graph and view it using Netron to verify that model is fully optimized.\n",
"# Note that this will increase session creation time, so it is for debugging only.\n",
"sess_options.optimized_model_filepath = os.path.join(output_dir, \"optimized_model_cpu.onnx\")\n",
"\n",
"# For OnnxRuntime 1.2.0, you might need set intra_op_num_threads to 1 to enable OpenMP\n",
"# sess_options.intra_op_num_threads=1\n",
"# For OnnxRuntime 1.3.0 or later, it is recommended to use the default setting so you need not set it.\n",
"# For OnnxRuntime 1.7.0 or later, you can control the number of threads by setting intra_op_num_threads, for example:\n",
"# sess_options.intra_op_num_threads=4\n",
"# Here we use the default value which is a good choice in most cases.\n",
"\n",
"# Specify providers when you use onnxruntime-gpu for CPU inference.\n",
"session = onnxruntime.InferenceSession(export_model_path, sess_options, providers=['CPUExecutionProvider'])\n",
@ -427,7 +359,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 8,
"metadata": {},
"outputs": [
{
@ -474,17 +406,17 @@
"\n",
"Example Usage:\n",
"```\n",
"from onnxruntime_tools import optimizer\n",
"from onnxruntime.transformers import optimizer\n",
"optimized_model = optimizer.optimize_model(export_model_path, model_type='bert', num_heads=12, hidden_size=768)\n",
"optimized_model.save_model_to_file(optimized_model_path)\n",
"```\n",
"\n",
"You can also use optimizer_cli like the following:"
"You can also run the optimizer from the command line, as follows:"
]
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 9,
"metadata": {},
"outputs": [
{
@ -493,15 +425,17 @@
"text": [
" apply: Fused LayerNormalization count: 25\n",
" apply: Fused Gelu count: 12\n",
" apply: Fused SkipLayerNormalization count: 25\n",
"adjust_reshape_and_expand: Removed Reshape and Expand count: 0\n",
" apply: Fused SkipLayerNormalization count: 24\n",
" apply: Fused Attention count: 12\n",
" prune_graph: Graph pruned: 0 inputs, 0 outputs and 5 nodes are removed\n",
" apply: Fused EmbedLayerNormalization(with mask) count: 1\n",
" prune_graph: Graph pruned: 0 inputs, 0 outputs and 12 nodes are removed\n",
" prune_graph: Graph pruned: 0 inputs, 0 outputs and 3 nodes are removed\n",
" prune_graph: Graph pruned: 0 inputs, 0 outputs and 0 nodes are removed\n",
" apply: Fused BiasGelu count: 12\n",
" apply: Fused SkipLayerNormalization(add bias) count: 24\n",
" optimize: opset verion: 11\n",
" save_model_to_file: Sort graphs in topological order\n",
" save_model_to_file: Output model to ..\\onnx_models\\bert-base-cased-squad_opt_cpu.onnx\n",
"get_fused_operator_statistics: Optimized operators:{'EmbedLayerNormalization': 1, 'Attention': 12, 'Gelu': 0, 'FastGelu': 0, 'BiasGelu': 12, 'LayerNormalization': 0, 'SkipLayerNormalization': 24}\n",
" main: The model has been fully optimized.\n"
@ -511,7 +445,7 @@
"source": [
"optimized_model_path = os.path.join(output_dir, 'bert-base-cased-squad_opt_cpu.onnx')\n",
"\n",
"!{sys.executable} -m onnxruntime_tools.optimizer_cli --input $export_model_path --output $optimized_model_path --model_type bert --num_heads 12 --hidden_size 768"
"!{sys.executable} -m onnxruntime.transformers.optimizer --input $export_model_path --output $optimized_model_path --model_type bert --num_heads 12 --hidden_size 768"
]
},
{
@ -527,7 +461,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 10,
"metadata": {},
"outputs": [
{
@ -561,7 +495,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 11,
"metadata": {},
"outputs": [
{
@ -569,13 +503,13 @@
"output_type": "stream",
"text": [
"100% passed for 100 random inputs given thresholds (rtol=0.001, atol=0.0001).\n",
"maximum absolute difference=5.930662155151367e-06\n",
"maximum relative difference=0.021568937227129936\n"
"maximum absolute difference=4.604458808898926e-06\n",
"maximum relative difference=0.006278202868998051\n"
]
}
],
"source": [
"!{sys.executable} -m onnxruntime_tools.transformers.compare_bert_results --baseline_model $export_model_path --optimized_model $optimized_model_path --batch_size 1 --sequence_length 128 --samples 100"
"!{sys.executable} -m onnxruntime.transformers.compare_bert_results --baseline_model $export_model_path --optimized_model $optimized_model_path --batch_size 1 --sequence_length 128 --samples 100"
]
},
{
@ -591,45 +525,45 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=0,OMP_NUM_THREADS=,OMP_WAIT_POLICY=,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
"Average latency = 80.08 ms, Throughput = 12.49 QPS\n",
"Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=,OMP_WAIT_POLICY=,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
"Average latency = 78.56 ms, Throughput = 12.73 QPS\n",
"Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
"Average latency = 77.78 ms, Throughput = 12.86 QPS\n",
"Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
"Average latency = 73.52 ms, Throughput = 13.60 QPS\n",
"Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=6,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
"Average latency = 95.36 ms, Throughput = 10.49 QPS\n",
"Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=6,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
"Average latency = 78.49 ms, Throughput = 12.74 QPS\n",
"Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
"Average latency = 288.71 ms, Throughput = 3.46 QPS\n",
"Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
"Average latency = 288.61 ms, Throughput = 3.46 QPS\n",
"Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=6,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
"Average latency = 288.97 ms, Throughput = 3.46 QPS\n",
"Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=6,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
"Average latency = 288.37 ms, Throughput = 3.47 QPS\n",
"Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=6,OMP_NUM_THREADS=6,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
"Average latency = 95.49 ms, Throughput = 10.47 QPS\n",
"Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=6,OMP_NUM_THREADS=6,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
"Average latency = 79.17 ms, Throughput = 12.63 QPS\n",
"test setting TestSetting(batch_size=1, sequence_length=128, test_cases=100, test_times=1, contiguous=None, use_gpu=False, warmup=True, omp_num_threads=None, omp_wait_policy=None, intra_op_num_threads=None, seed=3, verbose=False, inclusive=False, extra_latency=True)\n",
"Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
"Average latency = 54.26 ms, Throughput = 18.43 QPS\n",
"Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=11,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
"Average latency = 55.80 ms, Throughput = 17.92 QPS\n",
"Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=10,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
"Average latency = 65.31 ms, Throughput = 15.31 QPS\n",
"Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=9,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
"Average latency = 57.66 ms, Throughput = 17.34 QPS\n",
"Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
"Average latency = 62.84 ms, Throughput = 15.91 QPS\n",
"Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=7,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
"Average latency = 69.29 ms, Throughput = 14.43 QPS\n",
"Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=6,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
"Average latency = 56.19 ms, Throughput = 17.80 QPS\n",
"Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=5,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
"Average latency = 59.90 ms, Throughput = 16.70 QPS\n",
"Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=4,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
"Average latency = 63.72 ms, Throughput = 15.69 QPS\n",
"Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
"Average latency = 82.44 ms, Throughput = 12.13 QPS\n",
"Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=2,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
"Average latency = 119.64 ms, Throughput = 8.36 QPS\n",
"Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
"Average latency = 223.21 ms, Throughput = 4.48 QPS\n",
"test setting TestSetting(batch_size=1, sequence_length=128, test_cases=100, test_times=1, use_gpu=False, intra_op_num_threads=None, seed=3, verbose=False)\n",
"Generating 100 samples for batch_size=1 sequence_length=128\n",
"Test summary is saved to ..\\onnx_models\\perf_results_CPU_B1_S128_20200806-173045.txt\n"
"Test summary is saved to ..\\onnx_models\\perf_results_CPU_B1_S128_20210713-144140.txt\n"
]
}
],
"source": [
"!{sys.executable} -m onnxruntime_tools.transformers.bert_perf_test --model $optimized_model_path --batch_size 1 --sequence_length 128 --samples 100 --test_times 1 --inclusive --all"
"!{sys.executable} -m onnxruntime.transformers.bert_perf_test --model $optimized_model_path --batch_size 1 --sequence_length 128 --samples 100 --test_times 1"
]
},
{
@ -641,14 +575,14 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"..\\onnx_models\\perf_results_CPU_B1_S128_20200806-173045.txt\n"
"..\\onnx_models\\perf_results_CPU_B1_S128_20210713-144140.txt\n"
]
},
{
@ -678,155 +612,116 @@
" <th>Latency_P99</th>\n",
" <th>Throughput(QPS)</th>\n",
" <th>intra_op_num_threads</th>\n",
" <th>OMP_NUM_THREADS</th>\n",
" <th>OMP_WAIT_POLICY</th>\n",
" <th>contiguous</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>73.52</td>\n",
" <td>75.78</td>\n",
" <td>78.21</td>\n",
" <td>89.29</td>\n",
" <td>13.60</td>\n",
" <td>1</td>\n",
" <td>54.26</td>\n",
" <td>56.05</td>\n",
" <td>60.32</td>\n",
" <td>109.21</td>\n",
" <td>18.43</td>\n",
" <td>12</td>\n",
" <td>PASSIVE</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>77.78</td>\n",
" <td>82.35</td>\n",
" <td>87.02</td>\n",
" <td>104.54</td>\n",
" <td>12.86</td>\n",
" <td>1</td>\n",
" <td>12</td>\n",
" <td>ACTIVE</td>\n",
" <td>None</td>\n",
" <td>55.80</td>\n",
" <td>56.74</td>\n",
" <td>59.67</td>\n",
" <td>73.62</td>\n",
" <td>17.92</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>78.49</td>\n",
" <td>80.92</td>\n",
" <td>85.77</td>\n",
" <td>98.98</td>\n",
" <td>12.74</td>\n",
" <td>1</td>\n",
" <td>56.19</td>\n",
" <td>61.29</td>\n",
" <td>71.69</td>\n",
" <td>80.15</td>\n",
" <td>17.80</td>\n",
" <td>6</td>\n",
" <td>PASSIVE</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>78.56</td>\n",
" <td>82.29</td>\n",
" <td>93.46</td>\n",
" <td>108.73</td>\n",
" <td>12.73</td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>None</td>\n",
" <td>57.66</td>\n",
" <td>58.50</td>\n",
" <td>61.96</td>\n",
" <td>65.12</td>\n",
" <td>17.34</td>\n",
" <td>9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>79.17</td>\n",
" <td>82.02</td>\n",
" <td>87.60</td>\n",
" <td>99.55</td>\n",
" <td>12.63</td>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" <td>PASSIVE</td>\n",
" <td>None</td>\n",
" <td>59.90</td>\n",
" <td>59.72</td>\n",
" <td>65.16</td>\n",
" <td>116.16</td>\n",
" <td>16.70</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>80.08</td>\n",
" <td>83.18</td>\n",
" <td>95.60</td>\n",
" <td>107.72</td>\n",
" <td>12.49</td>\n",
" <td>0</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>None</td>\n",
" <td>62.84</td>\n",
" <td>67.05</td>\n",
" <td>69.07</td>\n",
" <td>75.99</td>\n",
" <td>15.91</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>95.36</td>\n",
" <td>101.25</td>\n",
" <td>103.61</td>\n",
" <td>105.15</td>\n",
" <td>10.49</td>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" <td>ACTIVE</td>\n",
" <td>None</td>\n",
" <td>63.72</td>\n",
" <td>64.17</td>\n",
" <td>69.44</td>\n",
" <td>73.10</td>\n",
" <td>15.69</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>95.49</td>\n",
" <td>101.50</td>\n",
" <td>102.66</td>\n",
" <td>104.82</td>\n",
" <td>10.47</td>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" <td>ACTIVE</td>\n",
" <td>None</td>\n",
" <td>65.31</td>\n",
" <td>65.35</td>\n",
" <td>80.70</td>\n",
" <td>177.94</td>\n",
" <td>15.31</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>288.37</td>\n",
" <td>290.48</td>\n",
" <td>295.37</td>\n",
" <td>308.91</td>\n",
" <td>3.47</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" <td>PASSIVE</td>\n",
" <td>None</td>\n",
" <td>69.29</td>\n",
" <td>69.04</td>\n",
" <td>70.68</td>\n",
" <td>85.03</td>\n",
" <td>14.43</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>288.61</td>\n",
" <td>291.10</td>\n",
" <td>295.78</td>\n",
" <td>301.52</td>\n",
" <td>3.46</td>\n",
" <td>12</td>\n",
" <td>1</td>\n",
" <td>PASSIVE</td>\n",
" <td>None</td>\n",
" <td>82.44</td>\n",
" <td>83.20</td>\n",
" <td>89.64</td>\n",
" <td>98.80</td>\n",
" <td>12.13</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>288.71</td>\n",
" <td>292.64</td>\n",
" <td>298.28</td>\n",
" <td>305.92</td>\n",
" <td>3.46</td>\n",
" <td>12</td>\n",
" <td>1</td>\n",
" <td>ACTIVE</td>\n",
" <td>None</td>\n",
" <td>119.64</td>\n",
" <td>119.07</td>\n",
" <td>122.62</td>\n",
" <td>135.67</td>\n",
" <td>8.36</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>288.97</td>\n",
" <td>291.18</td>\n",
" <td>297.68</td>\n",
" <td>309.30</td>\n",
" <td>3.46</td>\n",
" <td>6</td>\n",
" <td>223.21</td>\n",
" <td>223.22</td>\n",
" <td>226.83</td>\n",
" <td>249.08</td>\n",
" <td>4.48</td>\n",
" <td>1</td>\n",
" <td>ACTIVE</td>\n",
" <td>None</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
@ -834,35 +729,35 @@
],
"text/plain": [
" Latency(ms) Latency_P75 Latency_P90 Latency_P99 Throughput(QPS) \\\n",
"0 73.52 75.78 78.21 89.29 13.60 \n",
"1 77.78 82.35 87.02 104.54 12.86 \n",
"2 78.49 80.92 85.77 98.98 12.74 \n",
"3 78.56 82.29 93.46 108.73 12.73 \n",
"4 79.17 82.02 87.60 99.55 12.63 \n",
"5 80.08 83.18 95.60 107.72 12.49 \n",
"6 95.36 101.25 103.61 105.15 10.49 \n",
"7 95.49 101.50 102.66 104.82 10.47 \n",
"8 288.37 290.48 295.37 308.91 3.47 \n",
"9 288.61 291.10 295.78 301.52 3.46 \n",
"10 288.71 292.64 298.28 305.92 3.46 \n",
"11 288.97 291.18 297.68 309.30 3.46 \n",
"0 54.26 56.05 60.32 109.21 18.43 \n",
"1 55.80 56.74 59.67 73.62 17.92 \n",
"2 56.19 61.29 71.69 80.15 17.80 \n",
"3 57.66 58.50 61.96 65.12 17.34 \n",
"4 59.90 59.72 65.16 116.16 16.70 \n",
"5 62.84 67.05 69.07 75.99 15.91 \n",
"6 63.72 64.17 69.44 73.10 15.69 \n",
"7 65.31 65.35 80.70 177.94 15.31 \n",
"8 69.29 69.04 70.68 85.03 14.43 \n",
"9 82.44 83.20 89.64 98.80 12.13 \n",
"10 119.64 119.07 122.62 135.67 8.36 \n",
"11 223.21 223.22 226.83 249.08 4.48 \n",
"\n",
" intra_op_num_threads OMP_NUM_THREADS OMP_WAIT_POLICY contiguous \n",
"0 1 12 PASSIVE None \n",
"1 1 12 ACTIVE None \n",
"2 1 6 PASSIVE None \n",
"3 1 None \n",
"4 6 6 PASSIVE None \n",
"5 0 None \n",
"6 1 6 ACTIVE None \n",
"7 6 6 ACTIVE None \n",
"8 6 1 PASSIVE None \n",
"9 12 1 PASSIVE None \n",
"10 12 1 ACTIVE None \n",
"11 6 1 ACTIVE None "
" intra_op_num_threads \n",
"0 12 \n",
"1 11 \n",
"2 6 \n",
"3 9 \n",
"4 5 \n",
"5 8 \n",
"6 4 \n",
"7 10 \n",
"8 7 \n",
"9 3 \n",
"10 2 \n",
"11 1 "
]
},
"execution_count": 14,
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
@ -876,7 +771,7 @@
"print(latest_result_file)\n",
"\n",
"# Remove some columns that have same values for all rows.\n",
"columns_to_remove = ['model', 'graph_optimization_level', 'batch_size', 'sequence_length', 'test_cases', 'test_times', 'use_gpu', 'warmup']\n",
"columns_to_remove = ['model', 'graph_optimization_level', 'batch_size', 'sequence_length', 'test_cases', 'test_times', 'use_gpu']\n",
"# Hide some latency percentile columns to fit screen width.\n",
"columns_to_remove.extend(['Latency_P50', 'Latency_P95'])\n",
"result_data.drop(columns_to_remove, axis=1, inplace=True)\n",
@ -901,7 +796,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 14,
"metadata": {},
"outputs": [
{
@ -910,12 +805,12 @@
"text": [
"{\n",
" \"gpu\": {\n",
" \"driver_version\": \"442.23\",\n",
" \"driver_version\": \"470.14\",\n",
" \"devices\": [\n",
" {\n",
" \"memory_total\": 8589934592,\n",
" \"memory_available\": 6997721088,\n",
" \"name\": \"GeForce GTX 1070\"\n",
" \"memory_available\": 6782619648,\n",
" \"name\": \"NVIDIA GeForce GTX 1070\"\n",
" }\n",
" ]\n",
" },\n",
@ -925,22 +820,98 @@
" \"logical_cores\": 12,\n",
" \"hz\": \"3.1920 GHz\",\n",
" \"l2_cache\": \"1536 KB\",\n",
" \"l3_cache\": \"12288 KB\",\n",
" \"flags\": [\n",
" \"3dnow\",\n",
" \"3dnowprefetch\",\n",
" \"abm\",\n",
" \"acpi\",\n",
" \"adx\",\n",
" \"aes\",\n",
" \"apic\",\n",
" \"avx\",\n",
" \"avx2\",\n",
" \"bmi1\",\n",
" \"bmi2\",\n",
" \"clflush\",\n",
" \"clflushopt\",\n",
" \"cmov\",\n",
" \"cx16\",\n",
" \"cx8\",\n",
" \"de\",\n",
" \"dtes64\",\n",
" \"dts\",\n",
" \"erms\",\n",
" \"est\",\n",
" \"f16c\",\n",
" \"fma\",\n",
" \"fpu\",\n",
" \"fxsr\",\n",
" \"hle\",\n",
" \"ht\",\n",
" \"hypervisor\",\n",
" \"ia64\",\n",
" \"invpcid\",\n",
" \"lahf_lm\",\n",
" \"mca\",\n",
" \"mce\",\n",
" \"mmx\",\n",
" \"movbe\",\n",
" \"mpx\",\n",
" \"msr\",\n",
" \"mtrr\",\n",
" \"osxsave\",\n",
" \"pae\",\n",
" \"pat\",\n",
" \"pbe\",\n",
" \"pcid\",\n",
" \"pclmulqdq\",\n",
" \"pdcm\",\n",
" \"pge\",\n",
" \"pni\",\n",
" \"popcnt\",\n",
" \"pse\",\n",
" \"pse36\",\n",
" \"rdrnd\",\n",
" \"rdseed\",\n",
" \"rtm\",\n",
" \"sep\",\n",
" \"serial\",\n",
" \"sgx\",\n",
" \"sgx_lc\",\n",
" \"smap\",\n",
" \"smep\",\n",
" \"ss\",\n",
" \"sse\",\n",
" \"sse2\",\n",
" \"sse4_1\",\n",
" \"sse4_2\",\n",
" \"ssse3\",\n",
" \"tm\",\n",
" \"tm2\",\n",
" \"tsc\",\n",
" \"tscdeadline\",\n",
" \"vme\",\n",
" \"x2apic\",\n",
" \"xsave\",\n",
" \"xtpr\"\n",
" ],\n",
" \"processor\": \"Intel64 Family 6 Model 158 Stepping 10, GenuineIntel\"\n",
" },\n",
" \"memory\": {\n",
" \"total\": 16971276288,\n",
" \"available\": 4723568640\n",
" \"total\": 16977195008,\n",
" \"available\": 6085459968\n",
" },\n",
" \"python\": \"3.6.10.final.0 (64 bit)\",\n",
" \"os\": \"Windows-10-10.0.19041-SP0\",\n",
" \"os\": \"Windows-10-10.0.21390-SP0\",\n",
" \"onnxruntime\": {\n",
" \"version\": \"1.4.0\",\n",
" \"version\": \"1.8.1\",\n",
" \"support_gpu\": false\n",
" },\n",
" \"onnxruntime_tools\": null,\n",
" \"pytorch\": {\n",
" \"version\": \"1.6.0+cpu\",\n",
" \"support_gpu\": false\n",
" \"version\": \"1.9.0+cpu\",\n",
" \"support_gpu\": false,\n",
" \"cuda\": null\n",
" },\n",
" \"tensorflow\": {\n",
" \"version\": \"2.3.0\",\n",
@ -954,20 +925,14 @@
"name": "stderr",
"output_type": "stream",
"text": [
"2020-08-06 17:30:50.400838: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library cudart64_101.dll\n"
"2021-07-13 14:41:45.376756: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'cudart64_101.dll'; dlerror: cudart64_101.dll not found\n",
"2021-07-13 14:41:45.376780: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\n"
]
}
],
"source": [
"!{sys.executable} -m onnxruntime_tools.transformers.machine_info --silent"
"!{sys.executable} -m onnxruntime.transformers.machine_info --silent"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {