From dfe42e185c6c6de68177db8ecf307645ce831aec Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Mon, 19 Jul 2021 14:16:59 -0700 Subject: [PATCH] update bert notebook to use onnxruntime 1.8.1 (#8379) --- .../PyTorch_Bert-Squad_OnnxRuntime_CPU.ipynb | 565 +++++------ .../PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb | 934 ++++++++---------- 2 files changed, 685 insertions(+), 814 deletions(-) diff --git a/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_CPU.ipynb b/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_CPU.ipynb index 435fd8a3a2..1cb36fab0b 100644 --- a/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_CPU.ipynb +++ b/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_CPU.ipynb @@ -47,76 +47,23 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Looking in links: https://download.pytorch.org/whl/torch_stable.html\n", - "Requirement already up-to-date: torch==1.6.0+cpu in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (1.6.0+cpu)\n", - "Requirement already up-to-date: torchvision==0.7.0+cpu in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (0.7.0+cpu)\n", - "Requirement already satisfied, skipping upgrade: numpy in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from torch==1.6.0+cpu) (1.18.1)\n", - "Requirement already satisfied, skipping upgrade: future in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from torch==1.6.0+cpu) (0.18.2)\n", - "Requirement already satisfied, skipping upgrade: pillow>=4.1.1 in c:\\users\\tianl\\appdata\\roaming\\python\\python36\\site-packages (from torchvision==0.7.0+cpu) (7.0.0)\n", - "Requirement already up-to-date: onnxruntime==1.4.0 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (1.4.0)\n", - "Requirement already satisfied, skipping upgrade: numpy>=1.16.6 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages 
(from onnxruntime==1.4.0) (1.18.1)\n", - "Requirement already satisfied, skipping upgrade: protobuf in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime==1.4.0) (3.11.3)\n", - "Requirement already satisfied, skipping upgrade: six>=1.9 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from protobuf->onnxruntime==1.4.0) (1.14.0)\n", - "Requirement already satisfied, skipping upgrade: setuptools in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from protobuf->onnxruntime==1.4.0) (45.2.0.post20200210)\n", - "Requirement already up-to-date: onnxruntime-tools in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (1.4.1)\n", - "Requirement already satisfied, skipping upgrade: py-cpuinfo in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (5.0.0)\n", - "Requirement already satisfied, skipping upgrade: packaging in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (20.1)\n", - "Requirement already satisfied, skipping upgrade: coloredlogs in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (14.0)\n", - "Requirement already satisfied, skipping upgrade: py3nvml in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (0.2.5)\n", - "Requirement already satisfied, skipping upgrade: numpy in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (1.18.1)\n", - "Requirement already satisfied, skipping upgrade: psutil in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (5.7.0)\n", - "Requirement already satisfied, skipping upgrade: onnx in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (1.7.0)\n", - "Requirement already satisfied, skipping upgrade: pyparsing>=2.0.2 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from packaging->onnxruntime-tools) (2.4.6)\n", - "Requirement already satisfied, skipping upgrade: six in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from 
packaging->onnxruntime-tools) (1.14.0)\n", - "Requirement already satisfied, skipping upgrade: humanfriendly>=7.1 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from coloredlogs->onnxruntime-tools) (8.1)\n", - "Requirement already satisfied, skipping upgrade: xmltodict in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from py3nvml->onnxruntime-tools) (0.12.0)\n", - "Requirement already satisfied, skipping upgrade: protobuf in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnx->onnxruntime-tools) (3.11.3)\n", - "Requirement already satisfied, skipping upgrade: typing-extensions>=3.6.2.1 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnx->onnxruntime-tools) (3.7.4.1)\n", - "Requirement already satisfied, skipping upgrade: pyreadline; sys_platform == \"win32\" in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from humanfriendly>=7.1->coloredlogs->onnxruntime-tools) (2.1)\n", - "Requirement already satisfied, skipping upgrade: setuptools in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from protobuf->onnx->onnxruntime-tools) (45.2.0.post20200210)\n", - "Requirement already satisfied: transformers==3.0.2 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (3.0.2)\n", - "Requirement already satisfied: packaging in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (20.1)\n", - "Requirement already satisfied: sentencepiece!=0.1.92 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (0.1.85)\n", - "Requirement already satisfied: sacremoses in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (0.0.38)\n", - "Requirement already satisfied: dataclasses; python_version < \"3.7\" in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (0.7)\n", - "Requirement already satisfied: requests in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (2.23.0)\n", - "Requirement already satisfied: tokenizers==0.8.1.rc1 in 
d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (0.8.1rc1)\n", - "Requirement already satisfied: tqdm>=4.27 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (4.43.0)\n", - "Requirement already satisfied: regex!=2019.12.17 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (2020.2.20)\n", - "Requirement already satisfied: filelock in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (3.0.12)\n", - "Requirement already satisfied: numpy in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (1.18.1)\n", - "Requirement already satisfied: pyparsing>=2.0.2 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from packaging->transformers==3.0.2) (2.4.6)\n", - "Requirement already satisfied: six in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from packaging->transformers==3.0.2) (1.14.0)\n", - "Requirement already satisfied: click in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from sacremoses->transformers==3.0.2) (7.0)\n", - "Requirement already satisfied: joblib in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from sacremoses->transformers==3.0.2) (0.14.1)\n", - "Requirement already satisfied: certifi>=2017.4.17 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from requests->transformers==3.0.2) (2020.4.5.1)\n", - "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from requests->transformers==3.0.2) (1.25.8)\n", - "Requirement already satisfied: idna<3,>=2.5 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from requests->transformers==3.0.2) (2.9)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from requests->transformers==3.0.2) (3.0.4)\n", - "Requirement already satisfied: wget in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (3.2)\n", - "Requirement already satisfied: netron 
in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (3.9.6)\n" - ] - } - ], + "outputs": [], "source": [ - "# Install PyTorch 1.6.0 and OnnxRuntime 1.4.0 for CPU-only.\n", "import sys\n", - "if sys.platform == 'darwin': # Mac\n", - " !{sys.executable} -m pip install --upgrade torch torchvision\n", - "else:\n", - " !{sys.executable} -m pip install --upgrade torch==1.6.0+cpu torchvision==0.7.0+cpu -f https://download.pytorch.org/whl/torch_stable.html\n", - "!{sys.executable} -m pip install --upgrade onnxruntime==1.4.0\n", - "!{sys.executable} -m pip install --upgrade onnxconverter_common\n", - "!{sys.executable} -m pip install --upgrade onnxruntime-tools\n", "\n", - "# Install other packages used in this notebook.\n", - "!{sys.executable} -m pip install transformers==3.0.2\n", - "!{sys.executable} -m pip install wget netron" + "run_install = False # Only need install once\n", + "if run_install:\n", + " if sys.platform in ['linux', 'win32']: # Linux or Windows\n", + " !{sys.executable} -m pip install torch==1.9.0+cpu torchvision==0.10.0+cpu torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html\n", + " else: # Mac\n", + " !{sys.executable} -m pip install --upgrade torch torchvision torchaudio\n", + "\n", + " !{sys.executable} -m pip install onnxruntime==1.8.1 onnx==1.9.0 onnxconverter_common==1.8.1\n", + "\n", + " # Install other packages used in this notebook.\n", + " !{sys.executable} -m pip install transformers==4.8.2\n", + " !{sys.executable} -m pip install psutil pytz pandas py-cpuinfo py3nvml\n", + " !{sys.executable} -m pip install wget netron" ] }, { @@ -196,14 +143,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForQuestionAnswering: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 
'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']\n", - "- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).\n", + "Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForQuestionAnswering: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']\n", + "- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", "Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - "100%|██████████████████████████████████████████████████████████████████████████████████| 48/48 [00:03<00:00, 12.87it/s]\n", - "convert squad examples to features: 100%|███████████████████████████████████████████| 100/100 [00:00<00:00, 131.41it/s]\n", - "add example index and unique id: 100%|████████████████████████████████████████████| 100/100 [00:00<00:00, 96776.74it/s]\n" + 
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:03<00:00, 12.15it/s]\n", + "convert squad examples to features: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 135.87it/s]\n", + "add example index and unique id: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 100031.10it/s]\n" ] } ], @@ -252,6 +199,14 @@ "execution_count": 5, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\git\\transformers\\src\\transformers\\modeling_utils.py:2074: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " input_tensor.shape[chunk_dim] == tensor_shape for input_tensor in input_tensors\n" + ] + }, { "name": "stdout", "output_type": "stream", @@ -319,7 +274,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "PyTorch cpu Inference time = 144.73 ms\n" + "PyTorch cpu Inference time = 119.80 ms\n" ] } ], @@ -348,45 +303,26 @@ "source": [ "## 4. Inference ONNX Model with ONNX Runtime ##\n", "\n", - "### OpenMP Environment Variable\n", - "\n", - "OpenMP environment variables are very important for CPU inference of Bert model. 
It has large performance impact on Bert model so you might need set it carefully according to [Performance Test Tool](#Performance-Test-Tool) result in later part of this notebook.\n", - "\n", - "Setting environment variables shall be done before importing onnxruntime. Otherwise, they might not take effect." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "import psutil\n", - "\n", - "# You may change the settings in this cell according to Performance Test Tool result.\n", - "os.environ[\"OMP_NUM_THREADS\"] = str(psutil.cpu_count(logical=True))\n", - "os.environ[\"OMP_WAIT_POLICY\"] = 'ACTIVE'" + "For ONNX Runtime 1.6.0 or older, OpenMP environment variables are very important for CPU inference of the Bert model. Since 1.7.0, the official package is not built with OpenMP." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Now we are ready to inference the model with ONNX Runtime. Here we can see that OnnxRuntime has better performance than PyTorch. \n", - "\n", - "It is better to use standalone python script like [Performance Test tool](#Performance-Test-tool) to get accurate performance results." + "Now we run inference on the model with ONNX Runtime. Here we can see that OnnxRuntime has better performance than PyTorch. " ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "OnnxRuntime cpu Inference time = 88.55 ms\n" + "OnnxRuntime cpu Inference time = 72.46 ms\n" ] } ], @@ -394,19 +330,15 @@ "import onnxruntime\n", "import numpy\n", "\n", - "# Print warning if user uses onnxruntime-gpu instead of onnxruntime package.\n", - "if 'CUDAExecutionProvider' in onnxruntime.get_available_providers():\n", - " print(\"warning: onnxruntime-gpu is not built with OpenMP. 
You might try onnxruntime package to test CPU inference.\")\n", - "\n", "sess_options = onnxruntime.SessionOptions()\n", "\n", "# Optional: store the optimized graph and view it using Netron to verify that model is fully optimized.\n", "# Note that this will increase session creation time, so it is for debugging only.\n", "sess_options.optimized_model_filepath = os.path.join(output_dir, \"optimized_model_cpu.onnx\")\n", "\n", - "# For OnnxRuntime 1.2.0, you might need set intra_op_num_threads to 1 to enable OpenMP\n", - "# sess_options.intra_op_num_threads=1\n", - "# For OnnxRuntime 1.3.0 or later, it is recommended to use the default setting so you need not set it.\n", + "# For OnnxRuntime 1.7.0 or later, you can set intra_op_num_threads to set thread number like\n", + "# sess_options.intra_op_num_threads=4\n", + "# Here we use the default value which is a good choice in most cases.\n", "\n", "# Specify providers when you use onnxruntime-gpu for CPU inference.\n", "session = onnxruntime.InferenceSession(export_model_path, sess_options, providers=['CPUExecutionProvider'])\n", @@ -427,7 +359,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -474,17 +406,17 @@ "\n", "Example Usage:\n", "```\n", - "from onnxruntime_tools import optimizer\n", + "from onnxruntime.transformers import optimizer\n", "optimized_model = optimizer.optimize_model(export_model_path, model_type='bert', num_heads=12, hidden_size=768)\n", "optimized_model.save_model_to_file(optimized_model_path)\n", "```\n", "\n", - "You can also use optimizer_cli like the following:" + "You can also use command line like the following:" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -493,15 +425,17 @@ "text": [ " apply: Fused LayerNormalization count: 25\n", " apply: Fused Gelu count: 12\n", - " apply: Fused SkipLayerNormalization count: 25\n", + "adjust_reshape_and_expand: Removed 
Reshape and Expand count: 0\n", + " apply: Fused SkipLayerNormalization count: 24\n", " apply: Fused Attention count: 12\n", " prune_graph: Graph pruned: 0 inputs, 0 outputs and 5 nodes are removed\n", " apply: Fused EmbedLayerNormalization(with mask) count: 1\n", - " prune_graph: Graph pruned: 0 inputs, 0 outputs and 12 nodes are removed\n", + " prune_graph: Graph pruned: 0 inputs, 0 outputs and 3 nodes are removed\n", " prune_graph: Graph pruned: 0 inputs, 0 outputs and 0 nodes are removed\n", " apply: Fused BiasGelu count: 12\n", " apply: Fused SkipLayerNormalization(add bias) count: 24\n", " optimize: opset verion: 11\n", + " save_model_to_file: Sort graphs in topological order\n", " save_model_to_file: Output model to ..\\onnx_models\\bert-base-cased-squad_opt_cpu.onnx\n", "get_fused_operator_statistics: Optimized operators:{'EmbedLayerNormalization': 1, 'Attention': 12, 'Gelu': 0, 'FastGelu': 0, 'BiasGelu': 12, 'LayerNormalization': 0, 'SkipLayerNormalization': 24}\n", " main: The model has been fully optimized.\n" @@ -511,7 +445,7 @@ "source": [ "optimized_model_path = os.path.join(output_dir, 'bert-base-cased-squad_opt_cpu.onnx')\n", "\n", - "!{sys.executable} -m onnxruntime_tools.optimizer_cli --input $export_model_path --output $optimized_model_path --model_type bert --num_heads 12 --hidden_size 768" + "!{sys.executable} -m onnxruntime.transformers.optimizer --input $export_model_path --output $optimized_model_path --model_type bert --num_heads 12 --hidden_size 768" ] }, { @@ -527,7 +461,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -561,7 +495,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -569,13 +503,13 @@ "output_type": "stream", "text": [ "100% passed for 100 random inputs given thresholds (rtol=0.001, atol=0.0001).\n", - "maximum absolute difference=5.930662155151367e-06\n", - "maximum relative 
difference=0.021568937227129936\n" + "maximum absolute difference=4.604458808898926e-06\n", + "maximum relative difference=0.006278202868998051\n" ] } ], "source": [ - "!{sys.executable} -m onnxruntime_tools.transformers.compare_bert_results --baseline_model $export_model_path --optimized_model $optimized_model_path --batch_size 1 --sequence_length 128 --samples 100" + "!{sys.executable} -m onnxruntime.transformers.compare_bert_results --baseline_model $export_model_path --optimized_model $optimized_model_path --batch_size 1 --sequence_length 128 --samples 100" ] }, { @@ -591,45 +525,45 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=0,OMP_NUM_THREADS=,OMP_WAIT_POLICY=,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n", - "Average latency = 80.08 ms, Throughput = 12.49 QPS\n", - "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=,OMP_WAIT_POLICY=,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n", - "Average latency = 78.56 ms, Throughput = 12.73 QPS\n", - "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n", - "Average latency = 77.78 ms, Throughput = 12.86 QPS\n", - "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n", - "Average latency 
= 73.52 ms, Throughput = 13.60 QPS\n", - "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=6,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n", - "Average latency = 95.36 ms, Throughput = 10.49 QPS\n", - "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=6,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n", - "Average latency = 78.49 ms, Throughput = 12.74 QPS\n", - "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n", - "Average latency = 288.71 ms, Throughput = 3.46 QPS\n", - "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n", - "Average latency = 288.61 ms, Throughput = 3.46 QPS\n", - "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=6,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n", - "Average latency = 288.97 ms, Throughput = 3.46 QPS\n", - "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=6,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n", - "Average latency = 288.37 ms, Throughput = 3.47 QPS\n", - "Running test: 
model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=6,OMP_NUM_THREADS=6,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n", - "Average latency = 95.49 ms, Throughput = 10.47 QPS\n", - "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=6,OMP_NUM_THREADS=6,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n", - "Average latency = 79.17 ms, Throughput = 12.63 QPS\n", - "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=100, test_times=1, contiguous=None, use_gpu=False, warmup=True, omp_num_threads=None, omp_wait_policy=None, intra_op_num_threads=None, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", + "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n", + "Average latency = 54.26 ms, Throughput = 18.43 QPS\n", + "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=11,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n", + "Average latency = 55.80 ms, Throughput = 17.92 QPS\n", + "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=10,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n", + "Average latency = 65.31 ms, Throughput = 15.31 QPS\n", + "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=9,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n", + "Average latency = 57.66 ms, Throughput = 17.34 QPS\n", + "Running test: 
model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n", + "Average latency = 62.84 ms, Throughput = 15.91 QPS\n", + "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=7,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n", + "Average latency = 69.29 ms, Throughput = 14.43 QPS\n", + "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=6,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n", + "Average latency = 56.19 ms, Throughput = 17.80 QPS\n", + "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=5,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n", + "Average latency = 59.90 ms, Throughput = 16.70 QPS\n", + "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=4,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n", + "Average latency = 63.72 ms, Throughput = 15.69 QPS\n", + "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n", + "Average latency = 82.44 ms, Throughput = 12.13 QPS\n", + "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=2,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n", + "Average latency = 119.64 ms, Throughput = 8.36 QPS\n", + "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n", + "Average latency = 223.21 ms, Throughput 
= 4.48 QPS\n", + "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=100, test_times=1, use_gpu=False, intra_op_num_threads=None, seed=3, verbose=False)\n", "Generating 100 samples for batch_size=1 sequence_length=128\n", - "Test summary is saved to ..\\onnx_models\\perf_results_CPU_B1_S128_20200806-173045.txt\n" + "Test summary is saved to ..\\onnx_models\\perf_results_CPU_B1_S128_20210713-144140.txt\n" ] } ], "source": [ - "!{sys.executable} -m onnxruntime_tools.transformers.bert_perf_test --model $optimized_model_path --batch_size 1 --sequence_length 128 --samples 100 --test_times 1 --inclusive --all" + "!{sys.executable} -m onnxruntime.transformers.bert_perf_test --model $optimized_model_path --batch_size 1 --sequence_length 128 --samples 100 --test_times 1" ] }, { @@ -641,14 +575,14 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "..\\onnx_models\\perf_results_CPU_B1_S128_20200806-173045.txt\n" + "..\\onnx_models\\perf_results_CPU_B1_S128_20210713-144140.txt\n" ] }, { @@ -678,155 +612,116 @@ " Latency_P99\n", " Throughput(QPS)\n", " intra_op_num_threads\n", - " OMP_NUM_THREADS\n", - " OMP_WAIT_POLICY\n", - " contiguous\n", " \n", " \n", " \n", " \n", " 0\n", - " 73.52\n", - " 75.78\n", - " 78.21\n", - " 89.29\n", - " 13.60\n", - " 1\n", + " 54.26\n", + " 56.05\n", + " 60.32\n", + " 109.21\n", + " 18.43\n", " 12\n", - " PASSIVE\n", - " None\n", " \n", " \n", " 1\n", - " 77.78\n", - " 82.35\n", - " 87.02\n", - " 104.54\n", - " 12.86\n", - " 1\n", - " 12\n", - " ACTIVE\n", - " None\n", + " 55.80\n", + " 56.74\n", + " 59.67\n", + " 73.62\n", + " 17.92\n", + " 11\n", " \n", " \n", " 2\n", - " 78.49\n", - " 80.92\n", - " 85.77\n", - " 98.98\n", - " 12.74\n", - " 1\n", + " 56.19\n", + " 61.29\n", + " 71.69\n", + " 80.15\n", + " 17.80\n", " 6\n", - " PASSIVE\n", - " None\n", " \n", " \n", " 3\n", - " 78.56\n", - " 82.29\n", - " 
93.46\n", - " 108.73\n", - " 12.73\n", - " 1\n", - " \n", - " \n", - " None\n", + " 57.66\n", + " 58.50\n", + " 61.96\n", + " 65.12\n", + " 17.34\n", + " 9\n", " \n", " \n", " 4\n", - " 79.17\n", - " 82.02\n", - " 87.60\n", - " 99.55\n", - " 12.63\n", - " 6\n", - " 6\n", - " PASSIVE\n", - " None\n", + " 59.90\n", + " 59.72\n", + " 65.16\n", + " 116.16\n", + " 16.70\n", + " 5\n", " \n", " \n", " 5\n", - " 80.08\n", - " 83.18\n", - " 95.60\n", - " 107.72\n", - " 12.49\n", - " 0\n", - " \n", - " \n", - " None\n", + " 62.84\n", + " 67.05\n", + " 69.07\n", + " 75.99\n", + " 15.91\n", + " 8\n", " \n", " \n", " 6\n", - " 95.36\n", - " 101.25\n", - " 103.61\n", - " 105.15\n", - " 10.49\n", - " 1\n", - " 6\n", - " ACTIVE\n", - " None\n", + " 63.72\n", + " 64.17\n", + " 69.44\n", + " 73.10\n", + " 15.69\n", + " 4\n", " \n", " \n", " 7\n", - " 95.49\n", - " 101.50\n", - " 102.66\n", - " 104.82\n", - " 10.47\n", - " 6\n", - " 6\n", - " ACTIVE\n", - " None\n", + " 65.31\n", + " 65.35\n", + " 80.70\n", + " 177.94\n", + " 15.31\n", + " 10\n", " \n", " \n", " 8\n", - " 288.37\n", - " 290.48\n", - " 295.37\n", - " 308.91\n", - " 3.47\n", - " 6\n", - " 1\n", - " PASSIVE\n", - " None\n", + " 69.29\n", + " 69.04\n", + " 70.68\n", + " 85.03\n", + " 14.43\n", + " 7\n", " \n", " \n", " 9\n", - " 288.61\n", - " 291.10\n", - " 295.78\n", - " 301.52\n", - " 3.46\n", - " 12\n", - " 1\n", - " PASSIVE\n", - " None\n", + " 82.44\n", + " 83.20\n", + " 89.64\n", + " 98.80\n", + " 12.13\n", + " 3\n", " \n", " \n", " 10\n", - " 288.71\n", - " 292.64\n", - " 298.28\n", - " 305.92\n", - " 3.46\n", - " 12\n", - " 1\n", - " ACTIVE\n", - " None\n", + " 119.64\n", + " 119.07\n", + " 122.62\n", + " 135.67\n", + " 8.36\n", + " 2\n", " \n", " \n", " 11\n", - " 288.97\n", - " 291.18\n", - " 297.68\n", - " 309.30\n", - " 3.46\n", - " 6\n", + " 223.21\n", + " 223.22\n", + " 226.83\n", + " 249.08\n", + " 4.48\n", " 1\n", - " ACTIVE\n", - " None\n", " \n", " \n", "\n", @@ -834,35 +729,35 @@ ], "text/plain": [ " 
Latency(ms) Latency_P75 Latency_P90 Latency_P99 Throughput(QPS) \\\n", - "0 73.52 75.78 78.21 89.29 13.60 \n", - "1 77.78 82.35 87.02 104.54 12.86 \n", - "2 78.49 80.92 85.77 98.98 12.74 \n", - "3 78.56 82.29 93.46 108.73 12.73 \n", - "4 79.17 82.02 87.60 99.55 12.63 \n", - "5 80.08 83.18 95.60 107.72 12.49 \n", - "6 95.36 101.25 103.61 105.15 10.49 \n", - "7 95.49 101.50 102.66 104.82 10.47 \n", - "8 288.37 290.48 295.37 308.91 3.47 \n", - "9 288.61 291.10 295.78 301.52 3.46 \n", - "10 288.71 292.64 298.28 305.92 3.46 \n", - "11 288.97 291.18 297.68 309.30 3.46 \n", + "0 54.26 56.05 60.32 109.21 18.43 \n", + "1 55.80 56.74 59.67 73.62 17.92 \n", + "2 56.19 61.29 71.69 80.15 17.80 \n", + "3 57.66 58.50 61.96 65.12 17.34 \n", + "4 59.90 59.72 65.16 116.16 16.70 \n", + "5 62.84 67.05 69.07 75.99 15.91 \n", + "6 63.72 64.17 69.44 73.10 15.69 \n", + "7 65.31 65.35 80.70 177.94 15.31 \n", + "8 69.29 69.04 70.68 85.03 14.43 \n", + "9 82.44 83.20 89.64 98.80 12.13 \n", + "10 119.64 119.07 122.62 135.67 8.36 \n", + "11 223.21 223.22 226.83 249.08 4.48 \n", "\n", - " intra_op_num_threads OMP_NUM_THREADS OMP_WAIT_POLICY contiguous \n", - "0 1 12 PASSIVE None \n", - "1 1 12 ACTIVE None \n", - "2 1 6 PASSIVE None \n", - "3 1 None \n", - "4 6 6 PASSIVE None \n", - "5 0 None \n", - "6 1 6 ACTIVE None \n", - "7 6 6 ACTIVE None \n", - "8 6 1 PASSIVE None \n", - "9 12 1 PASSIVE None \n", - "10 12 1 ACTIVE None \n", - "11 6 1 ACTIVE None " + " intra_op_num_threads \n", + "0 12 \n", + "1 11 \n", + "2 6 \n", + "3 9 \n", + "4 5 \n", + "5 8 \n", + "6 4 \n", + "7 10 \n", + "8 7 \n", + "9 3 \n", + "10 2 \n", + "11 1 " ] }, - "execution_count": 14, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -876,7 +771,7 @@ "print(latest_result_file)\n", "\n", "# Remove some columns that have same values for all rows.\n", - "columns_to_remove = ['model', 'graph_optimization_level', 'batch_size', 'sequence_length', 'test_cases', 'test_times', 'use_gpu', 'warmup']\n", + 
"columns_to_remove = ['model', 'graph_optimization_level', 'batch_size', 'sequence_length', 'test_cases', 'test_times', 'use_gpu']\n", "# Hide some latency percentile columns to fit screen width.\n", "columns_to_remove.extend(['Latency_P50', 'Latency_P95'])\n", "result_data.drop(columns_to_remove, axis=1, inplace=True)\n", @@ -901,7 +796,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -910,12 +805,12 @@ "text": [ "{\n", " \"gpu\": {\n", - " \"driver_version\": \"442.23\",\n", + " \"driver_version\": \"470.14\",\n", " \"devices\": [\n", " {\n", " \"memory_total\": 8589934592,\n", - " \"memory_available\": 6997721088,\n", - " \"name\": \"GeForce GTX 1070\"\n", + " \"memory_available\": 6782619648,\n", + " \"name\": \"NVIDIA GeForce GTX 1070\"\n", " }\n", " ]\n", " },\n", @@ -925,22 +820,98 @@ " \"logical_cores\": 12,\n", " \"hz\": \"3.1920 GHz\",\n", " \"l2_cache\": \"1536 KB\",\n", - " \"l3_cache\": \"12288 KB\",\n", + " \"flags\": [\n", + " \"3dnow\",\n", + " \"3dnowprefetch\",\n", + " \"abm\",\n", + " \"acpi\",\n", + " \"adx\",\n", + " \"aes\",\n", + " \"apic\",\n", + " \"avx\",\n", + " \"avx2\",\n", + " \"bmi1\",\n", + " \"bmi2\",\n", + " \"clflush\",\n", + " \"clflushopt\",\n", + " \"cmov\",\n", + " \"cx16\",\n", + " \"cx8\",\n", + " \"de\",\n", + " \"dtes64\",\n", + " \"dts\",\n", + " \"erms\",\n", + " \"est\",\n", + " \"f16c\",\n", + " \"fma\",\n", + " \"fpu\",\n", + " \"fxsr\",\n", + " \"hle\",\n", + " \"ht\",\n", + " \"hypervisor\",\n", + " \"ia64\",\n", + " \"invpcid\",\n", + " \"lahf_lm\",\n", + " \"mca\",\n", + " \"mce\",\n", + " \"mmx\",\n", + " \"movbe\",\n", + " \"mpx\",\n", + " \"msr\",\n", + " \"mtrr\",\n", + " \"osxsave\",\n", + " \"pae\",\n", + " \"pat\",\n", + " \"pbe\",\n", + " \"pcid\",\n", + " \"pclmulqdq\",\n", + " \"pdcm\",\n", + " \"pge\",\n", + " \"pni\",\n", + " \"popcnt\",\n", + " \"pse\",\n", + " \"pse36\",\n", + " \"rdrnd\",\n", + " \"rdseed\",\n", + " \"rtm\",\n", + " 
\"sep\",\n", + " \"serial\",\n", + " \"sgx\",\n", + " \"sgx_lc\",\n", + " \"smap\",\n", + " \"smep\",\n", + " \"ss\",\n", + " \"sse\",\n", + " \"sse2\",\n", + " \"sse4_1\",\n", + " \"sse4_2\",\n", + " \"ssse3\",\n", + " \"tm\",\n", + " \"tm2\",\n", + " \"tsc\",\n", + " \"tscdeadline\",\n", + " \"vme\",\n", + " \"x2apic\",\n", + " \"xsave\",\n", + " \"xtpr\"\n", + " ],\n", " \"processor\": \"Intel64 Family 6 Model 158 Stepping 10, GenuineIntel\"\n", " },\n", " \"memory\": {\n", - " \"total\": 16971276288,\n", - " \"available\": 4723568640\n", + " \"total\": 16977195008,\n", + " \"available\": 6085459968\n", " },\n", " \"python\": \"3.6.10.final.0 (64 bit)\",\n", - " \"os\": \"Windows-10-10.0.19041-SP0\",\n", + " \"os\": \"Windows-10-10.0.21390-SP0\",\n", " \"onnxruntime\": {\n", - " \"version\": \"1.4.0\",\n", + " \"version\": \"1.8.1\",\n", " \"support_gpu\": false\n", " },\n", + " \"onnxruntime_tools\": null,\n", " \"pytorch\": {\n", - " \"version\": \"1.6.0+cpu\",\n", - " \"support_gpu\": false\n", + " \"version\": \"1.9.0+cpu\",\n", + " \"support_gpu\": false,\n", + " \"cuda\": null\n", " },\n", " \"tensorflow\": {\n", " \"version\": \"2.3.0\",\n", @@ -954,20 +925,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "2020-08-06 17:30:50.400838: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library cudart64_101.dll\n" + "2021-07-13 14:41:45.376756: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'cudart64_101.dll'; dlerror: cudart64_101.dll not found\n", + "2021-07-13 14:41:45.376780: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\n" ] } ], "source": [ - "!{sys.executable} -m onnxruntime_tools.transformers.machine_info --silent" + "!{sys.executable} -m onnxruntime.transformers.machine_info --silent" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - 
"outputs": [], - "source": [] } ], "metadata": { diff --git a/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb b/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb index 3b090b8232..1016aef1fe 100644 --- a/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb +++ b/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb @@ -36,17 +36,16 @@ "First, we install [AnaConda](https://www.anaconda.com/distribution/) in a target machine and open an AnaConda prompt window when it is done. Then run the following commands to create a conda environment. This notebook is tested with PyTorch 1.5.0 and OnnxRuntime 1.3.0.\n", "\n", "```console\n", - "conda create -n gpu_env python=3.7\n", + "conda create -n gpu_env python=3.6\n", "conda activate gpu_env\n", - "conda install pytorch torchvision cudatoolkit=10.1 -c pytorch\n", "conda install -c anaconda ipykernel\n", "conda install -c conda-forge ipywidgets\n", - "python -m ipykernel install --user --name=gpu_env_py37\n", + "python -m ipykernel install --user --name=gpu_env\n", "jupyter notebook\n", "```\n", - "Finally, launch Jupyter Notebook and you can choose gpu_env_py37 as kernel to run this notebook.\n", + "Finally, launch Jupyter Notebook and you can choose gpu_env as kernel to run this notebook.\n", "\n", - "Onnxruntime-gpu need specified version of CUDA and cuDNN. You can find the corresponding version in [requirements](https://github.com/microsoft/onnxruntime/tree/rel-1.3.0#system-requirements). If the version is different from above cudatoolkit version, you have to install them separately, and add their bin directories to PATH environment variable (See [CUDA and cuDNN Path](#CUDA-and-cuDNN-Path) below)." + "Onnxruntime-gpu need specified version of CUDA and cuDNN. You can find the Requirements [here]( http://www.onnxruntime.ai/docs/how-to/install.html). 
Remember to add the directories to PATH environment variable (See [CUDA and cuDNN Path](#CUDA-and-cuDNN-Path) below)." ] }, { @@ -56,12 +55,46 @@ "outputs": [], "source": [ "import sys\n", - "!{sys.executable} -m pip uninstall --quiet --yes onnxruntime-gpu\n", - "!{sys.executable} -m pip install --quiet onnxruntime-gpu\n", - "!{sys.executable} -m pip install --quiet --upgrade transformers\n", - "!{sys.executable} -m pip install --quiet --upgrade onnxconverter_common\n", - "!{sys.executable} -m pip install --quiet --upgrade onnxruntime-tools\n", - "!{sys.executable} -m pip install --quiet wget netron pandas" + "\n", + "run_install = False # Only need install once\n", + "if run_install:\n", + " if sys.platform in ['linux', 'win32']: # Linux or Windows\n", + " !{sys.executable} -m pip install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio===0.9.0 -f https://download.pytorch.org/whl/torch_stable.html\n", + " else: # Mac\n", + " print(\"PyTorch 1.9 MacOS Binaries do not support CUDA, install from source instead\")\n", + "\n", + " !{sys.executable} -m pip install onnxruntime-gpu==1.8.1 onnx==1.9.0 onnxconverter_common==1.8.1\n", + "\n", + " # Install other packages used in this notebook.\n", + " !{sys.executable} -m pip install transformers==4.8.2\n", + " !{sys.executable} -m pip install psutil pytz pandas py-cpuinfo py3nvml coloredlogs wget netron sympy" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "pytorch: 1.9.0+cu111\n", + "onnxruntime: 1.8.1\n", + "onnx: 1.9.0\n", + "transformers: 4.8.2\n" + ] + } + ], + "source": [ + "import torch\n", + "import onnx\n", + "import onnxruntime\n", + "import transformers\n", + "print(\"pytorch:\", torch.__version__)\n", + "print(\"onnxruntime:\", onnxruntime.__version__)\n", + "print(\"onnx:\", onnx.__version__)\n", + "print(\"transformers:\", transformers.__version__)" ] }, { @@ -80,7 +113,7 @@ }, { "cell_type": 
"code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -108,7 +141,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -131,12 +164,12 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ - "# For fine-tuned large model, the model name is \"bert-large-uncased-whole-word-masking-finetuned-squad\". Here we use bert-base for demo.\n", - "model_name_or_path = \"bert-base-cased\"\n", + "# fine-tuned model from https://huggingface.co/models?search=squad\n", + "model_name_or_path = \"bert-large-uncased-whole-word-masking-finetuned-squad\"\n", "max_seq_length = 128\n", "doc_stride = 128\n", "max_query_length = 64" @@ -151,16 +184,16 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 48/48 [00:04<00:00, 11.28it/s]\n", - "convert squad examples to features: 100%|██████████| 1000/1000 [00:09<00:00, 102.15it/s]\n", - "add example index and unique id: 100%|██████████| 1000/1000 [00:00<00:00, 161306.98it/s]\n" + "100%|██████████| 48/48 [00:03<00:00, 14.24it/s]\n", + "convert squad examples to features: 100%|██████████| 1000/1000 [00:08<00:00, 112.67it/s]\n", + "add example index and unique id: 100%|██████████| 1000/1000 [00:00<00:00, 836518.55it/s]\n" ] } ], @@ -206,9 +239,17 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/disk/conda3/envs/gpu_env/lib/python3.6/site-packages/transformers/modeling_utils.py:1974: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. 
This means that the trace might not generalize to other inputs!\n", + " input_tensor.shape[chunk_dim] == tensor_shape for input_tensor in input_tensors\n" + ] + }, { "name": "stdout", "output_type": "stream", @@ -270,14 +311,14 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "PyTorch cuda Inference time = 16.57 ms\n" + "PyTorch cuda Inference time = 16.56 ms\n" ] } ], @@ -307,47 +348,7 @@ "## 4. Inference ONNX Model with ONNX Runtime ##\n", "\n", "### CUDA and cuDNN Path\n", - "onnxruntime-gpu has dependency on [CUDA](https://developer.nvidia.com/cuda-downloads) and [cuDNN](https://developer.nvidia.com/cudnn):\n", - "\n", - "* [onnxruntime-gpu v1.3.0](https://github.com/microsoft/onnxruntime/tree/rel-1.3.0#system-requirements) requires CUDA Runtime 10.1 and CUDNN 7.6.5.\n", - "* [onnxruntime-gpu v1.2.0](https://github.com/microsoft/onnxruntime/releases/tag/v1.2.0) requires CUDA Runtime 10.1 and CUDNN 7.6.5.\n", - "\n", - "During installing PyTorch 1.5, we installed cudatoolkit 10.1.243 in this conda environment. That shall be good for onnxruntime-gpu 1.3.0 in Jupyter Notebook." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "# Change to True when onnxruntime (like onnxruntime-gpu 1.0.0 ~ 1.1.2) cannot be imported.\n", - "add_cuda_path = False\n", - "\n", - "if add_cuda_path:\n", - " # Add path of CUDA 10.0 and CUDNN 7.6 for onnxruntime-gpu 1.0.0 ~ 1.1.2\n", - " cuda_dir = 'D:/NVidia/CUDA/v10.1/bin'\n", - " cudnn_dir = 'D:/NVidia/CUDA/v10.1/bin'\n", - " if not (os.path.exists(cuda_dir) and os.path.exists(cudnn_dir)):\n", - " raise ValueError(\"Please specify correct path for CUDA and cuDNN. 
Otherwise onnxruntime cannot be imported.\")\n", - " else:\n", - " if cuda_dir == cudnn_dir:\n", - " os.environ[\"PATH\"] = cuda_dir + ';' + os.environ[\"PATH\"]\n", - " else:\n", - " os.environ[\"PATH\"] = cuda_dir + ';' + cudnn_dir + ';' + os.environ[\"PATH\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### OpenMP Environment Variable\n", - "\n", - "OpenMP environment variables are optional for GPU inference of standard Bert model. It has little performance impact on Bert model since most nodes are executed in GPU. \n", - "\n", - "You can find the best setting based on [Performance Test Tool](#Performance-Test-Tool) result in later part of this notebook.\n", - "\n", - "**Attention: Setting environment variables shall be done before importing onnxruntime**. Otherwise, they might not take effect." + "onnxruntime-gpu has dependency on [CUDA](https://developer.nvidia.com/cuda-downloads) and [cuDNN](https://developer.nvidia.com/cudnn). Required CUDA version can be found [here](http://www.onnxruntime.ai/docs/reference/execution-providers/CUDA-ExecutionProvider.html#requirements)\n" ] }, { @@ -356,9 +357,21 @@ "metadata": {}, "outputs": [], "source": [ - "# Optional. You can change them according to Performance Test Tool result.\n", - "#os.environ[\"OMP_NUM_THREADS\"] = '1'\n", - "#os.environ[\"OMP_WAIT_POLICY\"] = 'PASSIVE'" + "# Change to True when onnxruntime (like onnxruntime-gpu 1.0.0 ~ 1.1.2) cannot be imported.\n", + "add_cuda_path = False\n", + "\n", + "# For Linux, see https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#environment-setup\n", + "# Below is example for Windows\n", + "if add_cuda_path:\n", + " cuda_dir = 'D:/NVidia/CUDA/v11.0/bin'\n", + " cudnn_dir = 'D:/NVidia/CUDA/v11.0/bin'\n", + " if not (os.path.exists(cuda_dir) and os.path.exists(cudnn_dir)):\n", + " raise ValueError(\"Please specify correct path for CUDA and cuDNN. 
Otherwise onnxruntime cannot be imported.\")\n", + " else:\n", + " if cuda_dir == cudnn_dir:\n", + " os.environ[\"PATH\"] = cuda_dir + ';' + os.environ[\"PATH\"]\n", + " else:\n", + " os.environ[\"PATH\"] = cuda_dir + ';' + cudnn_dir + ';' + os.environ[\"PATH\"]" ] }, { @@ -377,7 +390,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "OnnxRuntime gpu Inference time = 4.43 ms\n" + "OnnxRuntime gpu Inference time = 25.28 ms\n" ] } ], @@ -403,7 +416,7 @@ "latency = []\n", "for i in range(total_samples):\n", " data = dataset[i]\n", - " # TODO: use IO Binding (see https://github.com/microsoft/onnxruntime/pull/4206) to improve performance.\n", + " # TODO: use IO Binding (see https://www.onnxruntime.ai/python/api_summary.html) to improve performance.\n", " ort_inputs = {\n", " 'input_ids': data[0].cpu().reshape(1, max_seq_length).numpy(),\n", " 'input_mask': data[1].cpu().reshape(1, max_seq_length).numpy(),\n", @@ -436,9 +449,9 @@ "text": [ "***** Verifying correctness *****\n", "PyTorch and ONNX Runtime output 0 are close: True\n", - "maximum_diff=9.499490261077881e-07 average_diff=1.4225952327251434e-07\n", + "maximum_diff=5.7220458984375e-06 average_diff=1.3103708624839783e-06\n", "PyTorch and ONNX Runtime output 1 are close: True\n", - "maximum_diff=6.92903995513916e-07 average_diff=1.2441887520253658e-07\n" + "maximum_diff=5.7220458984375e-06 average_diff=1.2257369235157967e-06\n" ] } ], @@ -472,13 +485,13 @@ { "data": { "text/plain": [ - "{'input_ids': tensor([[ 101, 1293, 1242, 2557, 1127, 1226, 1104, 1103, 3613, 16429,\n", - " 5235, 136, 102, 3613, 16429, 5988, 170, 107, 1353, 1671,\n", - " 1992, 1342, 107, 5235, 117, 1107, 1134, 1473, 3683, 3538,\n", - " 1125, 170, 1476, 118, 1248, 2595, 4086, 1714, 1104, 2965,\n", - " 15897, 1104, 3613, 16429, 119, 1473, 3683, 3538, 3222, 1149,\n", - " 2551, 1168, 23759, 1116, 1121, 1506, 1103, 10280, 2231, 1111,\n", - " 1103, 1714, 16355, 119, 102, 0, 0, 0, 0, 0,\n", + "{'input_ids': tensor([[ 101, 2054, 2329, 2694, 
2897, 2097, 4287, 1996, 3565, 4605,\n", + " 1029, 102, 1999, 1996, 2142, 2983, 1010, 4035, 2557, 1019,\n", + " 2444, 1998, 1019, 2444, 2998, 4469, 2097, 4287, 1996, 5049,\n", + " 1012, 1996, 4035, 2097, 4287, 2049, 2219, 2329, 2394, 3743,\n", + " 1010, 2007, 6754, 10184, 1010, 12270, 10589, 1998, 6857, 8945,\n", + " 18505, 2006, 8570, 1012, 102, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", @@ -488,13 +501,13 @@ " device='cuda:0'),\n", " 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", - " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,\n", + " 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0'),\n", - " 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", + " 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", - " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,\n", + " 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0')}" @@ -526,8 +539,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Average length 101\n", - "OnnxRuntime gpu Inference time with actual sequence length = 4.23 ms\n" + "Average length 94\n", + "OnnxRuntime gpu Inference time 
with actual sequence length = 21.93 ms\n" ] } ], @@ -611,12 +624,12 @@ "\n", "Example Usage:\n", "```\n", - "from onnxruntime_tools import optimizer\n", + "from onnxruntime.transformers import optimizer\n", "optimized_model = optimizer.optimize_model(export_model_path, model_type='bert', num_heads=12, hidden_size=768)\n", "optimized_model.save_model_to_file(optimized_model_path)\n", "```\n", "\n", - "You can also use optimizer_cli like the following:" + "You can also use command line like the following:" ] }, { @@ -638,20 +651,21 @@ "name": "stdout", "output_type": "stream", "text": [ - "optimize_by_onnxruntime: Save optimized model by onnxruntime to ./onnx/bert-base-cased-squad_opset11_o1_cpu.onnx\n", - " apply: Fused LayerNormalization count: 25\n", - " apply: Fused Gelu count: 12\n", - " apply: Fused SkipLayerNormalization count: 25\n", - " apply: Fused Attention count: 12\n", + " apply: Fused LayerNormalization count: 49\n", + " apply: Fused Gelu count: 24\n", + "adjust_reshape_and_expand: Removed Reshape and Expand count: 0\n", + " apply: Fused SkipLayerNormalization count: 48\n", + " apply: Fused Attention count: 24\n", " prune_graph: Graph pruned: 0 inputs, 0 outputs and 5 nodes are removed\n", " apply: Fused EmbedLayerNormalization(with mask) count: 1\n", - " prune_graph: Graph pruned: 0 inputs, 0 outputs and 12 nodes are removed\n", + " prune_graph: Graph pruned: 0 inputs, 0 outputs and 3 nodes are removed\n", " prune_graph: Graph pruned: 0 inputs, 0 outputs and 0 nodes are removed\n", - " apply: Fused BiasGelu count: 12\n", - " apply: Fused SkipLayerNormalization(add bias) count: 24\n", + " apply: Fused BiasGelu count: 24\n", + " apply: Fused SkipLayerNormalization(add bias) count: 48\n", " optimize: opset verion: 11\n", + " save_model_to_file: Sort graphs in topological order\n", " save_model_to_file: Output model to ./onnx/bert-base-cased-squad_opt_gpu_fp32.onnx\n", - "get_fused_operator_statistics: Optimized operators:{'EmbedLayerNormalization': 1, 
'Attention': 12, 'Gelu': 0, 'FastGelu': 0, 'BiasGelu': 12, 'LayerNormalization': 0, 'SkipLayerNormalization': 24}\n", + "get_fused_operator_statistics: Optimized operators:{'EmbedLayerNormalization': 1, 'Attention': 24, 'Gelu': 0, 'FastGelu': 0, 'BiasGelu': 24, 'LayerNormalization': 0, 'SkipLayerNormalization': 48}\n", " main: The model has been fully optimized.\n" ] } @@ -659,7 +673,7 @@ "source": [ "optimized_fp32_model_path = './onnx/bert-base-cased-squad_opt_{}_fp32.onnx'.format('gpu' if use_gpu else 'cpu')\n", "\n", - "!python -m onnxruntime_tools.optimizer_cli --input $export_model_path --output $optimized_fp32_model_path" + "!{sys.executable} -m onnxruntime.transformers.optimizer --input $export_model_path --output $optimized_fp32_model_path" ] }, { @@ -712,32 +726,24 @@ "name": "stdout", "output_type": "stream", "text": [ - "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=None, omp_wait_policy=None, intra_op_num_threads=None, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", + "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=None, seed=3, verbose=False)\n", "Generating 1000 samples for batch_size=1 sequence_length=128\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=0,OMP_NUM_THREADS=,OMP_WAIT_POLICY=,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", - "Average latency = 4.92 ms, Throughput = 203.24 QPS\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", - "Average latency = 4.90 ms, Throughput = 203.88 QPS\n", - 
"Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", - "Average latency = 5.07 ms, Throughput = 197.16 QPS\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", - "Average latency = 4.82 ms, Throughput = 207.33 QPS\n", - "skip duplicated test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", - "skip duplicated test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", - "Average latency = 4.93 ms, Throughput = 202.92 QPS\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", - "Average latency = 4.91 ms, Throughput = 203.55 QPS\n", - "Running test: 
model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", - "Average latency = 4.88 ms, Throughput = 204.90 QPS\n", - "Test summary is saved to onnx/perf_results_GPU_B1_S128_20200617-232134.txt\n" + "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=4,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", + "Average latency = 23.72 ms, Throughput = 42.15 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", + "Average latency = 24.24 ms, Throughput = 41.25 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=2,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", + "Average latency = 24.36 ms, Throughput = 41.05 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", + "Average latency = 24.39 ms, Throughput = 41.01 QPS\n", + "Test summary is saved to onnx/perf_results_GPU_B1_S128_20210714-001817.txt\n" ] } ], "source": [ "GPU_OPTION = '--use_gpu' if use_gpu else ''\n", "\n", - "!python -m onnxruntime_tools.transformers.bert_perf_test --model $optimized_fp32_model_path --batch_size 1 --sequence_length 128 --samples 1000 --test_times 1 --inclusive --all $GPU_OPTION" + "!{sys.executable} -m onnxruntime.transformers.bert_perf_test --model $optimized_fp32_model_path --batch_size 1 --sequence_length 128 --samples 1000 --test_times 1 $GPU_OPTION" ] }, { @@ -756,7 +762,7 @@ "name": 
"stdout", "output_type": "stream", "text": [ - "Float32 model perf results from ./onnx/perf_results_GPU_B1_S128_20200617-232134.txt\n" + "Float32 model perf results from ./onnx/perf_results_GPU_B1_S128_20210714-001817.txt\n" ] }, { @@ -788,117 +794,52 @@ " Latency_P99\n", " Throughput(QPS)\n", " intra_op_num_threads\n", - " OMP_NUM_THREADS\n", - " OMP_WAIT_POLICY\n", - " contiguous\n", - " warmup\n", " \n", " \n", " \n", " \n", " 0\n", - " 4.82\n", - " 4.53\n", - " 4.57\n", - " 5.15\n", - " 7.25\n", - " 8.75\n", - " 207.33\n", - " 1\n", - " 12\n", - " ACTIVE\n", - " None\n", - " True\n", + " 23.72\n", + " 23.72\n", + " 23.87\n", + " 23.99\n", + " 24.11\n", + " 24.37\n", + " 42.15\n", + " 4\n", " \n", " \n", " 1\n", - " 4.88\n", - " 4.54\n", - " 4.58\n", - " 6.47\n", - " 7.13\n", - " 8.68\n", - " 204.90\n", - " 12\n", - " 12\n", - " PASSIVE\n", - " None\n", - " True\n", + " 24.24\n", + " 24.24\n", + " 24.42\n", + " 24.60\n", + " 24.76\n", + " 25.23\n", + " 41.25\n", + " 3\n", " \n", " \n", " 2\n", - " 4.90\n", - " 4.54\n", - " 4.57\n", - " 6.16\n", - " 7.64\n", - " 8.82\n", - " 203.88\n", - " 1\n", - " 12\n", - " PASSIVE\n", - " None\n", - " True\n", + " 24.36\n", + " 24.36\n", + " 24.47\n", + " 24.69\n", + " 25.01\n", + " 26.52\n", + " 41.05\n", + " 2\n", " \n", " \n", " 3\n", - " 4.91\n", - " 4.55\n", - " 4.59\n", - " 6.70\n", - " 7.43\n", - " 8.78\n", - " 203.55\n", - " 12\n", - " 12\n", - " ACTIVE\n", - " None\n", - " True\n", - " \n", - " \n", - " 4\n", - " 4.92\n", - " 4.57\n", - " 4.60\n", - " 6.50\n", - " 7.82\n", - " 8.90\n", - " 203.24\n", - " 0\n", - " \n", - " \n", - " None\n", - " True\n", - " \n", - " \n", - " 5\n", - " 4.93\n", - " 4.55\n", - " 4.59\n", - " 6.66\n", - " 7.57\n", - " 8.80\n", - " 202.92\n", - " 12\n", + " 24.39\n", + " 24.37\n", + " 24.47\n", + " 24.65\n", + " 24.73\n", + " 25.12\n", + " 41.01\n", " 1\n", - " PASSIVE\n", - " None\n", - " True\n", - " \n", - " \n", - " 6\n", - " 5.07\n", - " 4.56\n", - " 4.61\n", - " 7.19\n", - " 
8.11\n", - " 9.01\n", - " 197.16\n", - " 12\n", - " 1\n", - " ACTIVE\n", - " None\n", - " True\n", " \n", " \n", "\n", @@ -906,31 +847,16 @@ ], "text/plain": [ " Latency(ms) Latency_P50 Latency_P75 Latency_P90 Latency_P95 \\\n", - "0 4.82 4.53 4.57 5.15 7.25 \n", - "1 4.88 4.54 4.58 6.47 7.13 \n", - "2 4.90 4.54 4.57 6.16 7.64 \n", - "3 4.91 4.55 4.59 6.70 7.43 \n", - "4 4.92 4.57 4.60 6.50 7.82 \n", - "5 4.93 4.55 4.59 6.66 7.57 \n", - "6 5.07 4.56 4.61 7.19 8.11 \n", + "0 23.72 23.72 23.87 23.99 24.11 \n", + "1 24.24 24.24 24.42 24.60 24.76 \n", + "2 24.36 24.36 24.47 24.69 25.01 \n", + "3 24.39 24.37 24.47 24.65 24.73 \n", "\n", - " Latency_P99 Throughput(QPS) intra_op_num_threads OMP_NUM_THREADS \\\n", - "0 8.75 207.33 1 12 \n", - "1 8.68 204.90 12 12 \n", - "2 8.82 203.88 1 12 \n", - "3 8.78 203.55 12 12 \n", - "4 8.90 203.24 0 \n", - "5 8.80 202.92 12 1 \n", - "6 9.01 197.16 12 1 \n", - "\n", - " OMP_WAIT_POLICY contiguous warmup \n", - "0 ACTIVE None True \n", - "1 PASSIVE None True \n", - "2 PASSIVE None True \n", - "3 ACTIVE None True \n", - "4 None True \n", - "5 PASSIVE None True \n", - "6 ACTIVE None True " + " Latency_P99 Throughput(QPS) intra_op_num_threads \n", + "0 24.37 42.15 4 \n", + "1 25.23 41.25 3 \n", + "2 26.52 41.05 2 \n", + "3 25.12 41.01 1 " ] }, "execution_count": 18, @@ -943,7 +869,7 @@ "import glob \n", "import pandas\n", "latest_result_file = max(glob.glob(\"./onnx/perf_results_GPU_B1_S128_*.txt\"), key=os.path.getmtime)\n", - "result_data = pandas.read_table(latest_result_file, converters={'OMP_NUM_THREADS': str, 'OMP_WAIT_POLICY':str})\n", + "result_data = pandas.read_table(latest_result_file)\n", "print(\"Float32 model perf results from\", latest_result_file)\n", "# Remove some columns that have same values for all rows.\n", "columns_to_remove = ['model', 'graph_optimization_level', 'batch_size', 'sequence_length', 'test_cases', 'test_times', 'use_gpu']\n", @@ -974,13 +900,13 @@ "output_type": "stream", "text": [ "100% passed for 
100 random inputs given thresholds (rtol=0.01, atol=0.01).\r\n", - "maximum absolute difference=1.9222497940063477e-06\r\n", - "maximum relative difference=0.05027933046221733\r\n" + "maximum absolute difference=5.316734313964844e-05\r\n", + "maximum relative difference=0.00012461667938623577\r\n" ] } ], "source": [ - "!python -m onnxruntime_tools.transformers.compare_bert_results --baseline_model $export_model_path --optimized_model $optimized_fp32_model_path --batch_size 1 --sequence_length 128 --samples 100 --rtol 0.01 --atol 0.01 $GPU_OPTION" + "!{sys.executable} -m onnxruntime.transformers.compare_bert_results --baseline_model $export_model_path --optimized_model $optimized_fp32_model_path --batch_size 1 --sequence_length 128 --samples 100 --rtol 0.01 --atol 0.01 $GPU_OPTION" ] }, { @@ -1003,27 +929,28 @@ "name": "stdout", "output_type": "stream", "text": [ - "optimize_by_onnxruntime: Save optimized model by onnxruntime to ./onnx/bert-base-cased-squad_opset11_o1_cpu.onnx\n", - " apply: Fused LayerNormalization count: 25\n", - " apply: Fused Gelu count: 12\n", - " apply: Fused SkipLayerNormalization count: 25\n", - " apply: Fused Attention count: 12\n", + " apply: Fused LayerNormalization count: 49\n", + " apply: Fused Gelu count: 24\n", + "adjust_reshape_and_expand: Removed Reshape and Expand count: 0\n", + " apply: Fused SkipLayerNormalization count: 48\n", + " apply: Fused Attention count: 24\n", " prune_graph: Graph pruned: 0 inputs, 0 outputs and 5 nodes are removed\n", " apply: Fused EmbedLayerNormalization(with mask) count: 1\n", - " prune_graph: Graph pruned: 0 inputs, 0 outputs and 12 nodes are removed\n", + " prune_graph: Graph pruned: 0 inputs, 0 outputs and 3 nodes are removed\n", " prune_graph: Graph pruned: 0 inputs, 0 outputs and 0 nodes are removed\n", - " apply: Fused BiasGelu count: 12\n", - " apply: Fused SkipLayerNormalization(add bias) count: 24\n", + " apply: Fused BiasGelu count: 24\n", + " apply: Fused SkipLayerNormalization(add bias) 
count: 48\n", " optimize: opset verion: 11\n", + " save_model_to_file: Sort graphs in topological order\n", " save_model_to_file: Output model to ./onnx/bert-base-cased-squad_opt_gpu_fp16.onnx\n", - "get_fused_operator_statistics: Optimized operators:{'EmbedLayerNormalization': 1, 'Attention': 12, 'Gelu': 0, 'FastGelu': 0, 'BiasGelu': 12, 'LayerNormalization': 0, 'SkipLayerNormalization': 24}\n", + "get_fused_operator_statistics: Optimized operators:{'EmbedLayerNormalization': 1, 'Attention': 24, 'Gelu': 0, 'FastGelu': 0, 'BiasGelu': 24, 'LayerNormalization': 0, 'SkipLayerNormalization': 48}\n", " main: The model has been fully optimized.\n" ] } ], "source": [ "optimized_fp16_model_path = './onnx/bert-base-cased-squad_opt_{}_fp16.onnx'.format('gpu' if use_gpu else 'cpu')\n", - "!python -m onnxruntime_tools.optimizer_cli --input $export_model_path --output $optimized_fp16_model_path --float16" + "!{sys.executable} -m onnxruntime.transformers.optimizer --input $export_model_path --output $optimized_fp16_model_path --float16" ] }, { @@ -1035,31 +962,23 @@ "name": "stdout", "output_type": "stream", "text": [ - "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=None, omp_wait_policy=None, intra_op_num_threads=None, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", + "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=None, seed=3, verbose=False)\n", "Generating 1000 samples for batch_size=1 sequence_length=128\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=0,OMP_NUM_THREADS=,OMP_WAIT_POLICY=,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", - "Average latency = 3.01 ms, Throughput = 331.90 QPS\n", - "Running test: 
model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", - "Average latency = 3.12 ms, Throughput = 320.00 QPS\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", - "Average latency = 3.02 ms, Throughput = 331.39 QPS\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", - "Average latency = 3.01 ms, Throughput = 332.53 QPS\n", - "skip duplicated test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", - "skip duplicated test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", - "Average latency = 3.04 ms, Throughput = 328.67 QPS\n", - "Running test: 
model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", - "Average latency = 3.01 ms, Throughput = 331.72 QPS\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", - "Average latency = 3.04 ms, Throughput = 329.32 QPS\n", - "Test summary is saved to onnx/perf_results_GPU_B1_S128_20200617-232234.txt\n" + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=4,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", + "Average latency = 6.78 ms, Throughput = 147.54 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", + "Average latency = 6.76 ms, Throughput = 147.85 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=2,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", + "Average latency = 6.79 ms, Throughput = 147.30 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", + "Average latency = 6.81 ms, Throughput = 146.75 QPS\n", + "Test summary is saved to onnx/perf_results_GPU_B1_S128_20210714-002224.txt\n" ] } ], "source": [ "GPU_OPTION = '--use_gpu' if use_gpu else ''\n", - "!python -m onnxruntime_tools.transformers.bert_perf_test --model 
$optimized_fp16_model_path --batch_size 1 --sequence_length 128 --samples 1000 --test_times 1 --inclusive --all $GPU_OPTION" + "!python -m onnxruntime.transformers.bert_perf_test --model $optimized_fp16_model_path --batch_size 1 --sequence_length 128 --samples 1000 --test_times 1 $GPU_OPTION" ] }, { @@ -1071,7 +990,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Float32 model perf results from ./onnx/perf_results_GPU_B1_S128_20200617-232234.txt\n" + "Float32 model perf results from ./onnx/perf_results_GPU_B1_S128_20210714-002224.txt\n" ] }, { @@ -1103,117 +1022,52 @@ " Latency_P99\n", " Throughput(QPS)\n", " intra_op_num_threads\n", - " OMP_NUM_THREADS\n", - " OMP_WAIT_POLICY\n", - " contiguous\n", - " warmup\n", " \n", " \n", " \n", " \n", " 0\n", - " 3.01\n", - " 2.79\n", - " 2.81\n", - " 2.86\n", - " 5.08\n", - " 7.16\n", - " 332.53\n", - " 1\n", - " 12\n", - " ACTIVE\n", - " None\n", - " True\n", + " 6.76\n", + " 6.79\n", + " 6.81\n", + " 6.90\n", + " 6.91\n", + " 7.00\n", + " 147.85\n", + " 3\n", " \n", " \n", " 1\n", - " 3.01\n", - " 2.80\n", - " 2.81\n", - " 2.88\n", - " 4.52\n", - " 7.05\n", - " 331.90\n", - " 0\n", - " \n", - " \n", - " None\n", - " True\n", + " 6.78\n", + " 6.70\n", + " 6.79\n", + " 6.87\n", + " 6.90\n", + " 7.63\n", + " 147.54\n", + " 4\n", " \n", " \n", " 2\n", - " 3.01\n", - " 2.78\n", - " 2.80\n", - " 2.92\n", - " 5.01\n", - " 7.02\n", - " 331.72\n", - " 12\n", - " 12\n", - " ACTIVE\n", - " None\n", - " True\n", + " 6.79\n", + " 6.79\n", + " 6.81\n", + " 6.89\n", + " 6.91\n", + " 7.19\n", + " 147.30\n", + " 2\n", " \n", " \n", " 3\n", - " 3.02\n", - " 2.79\n", - " 2.80\n", - " 2.85\n", - " 6.34\n", - " 7.04\n", - " 331.39\n", - " 12\n", - " 1\n", - " ACTIVE\n", - " None\n", - " True\n", - " \n", - " \n", - " 4\n", - " 3.04\n", - " 2.80\n", - " 2.82\n", - " 2.93\n", - " 5.56\n", - " 7.08\n", - " 329.32\n", - " 12\n", - " 12\n", - " PASSIVE\n", - " None\n", - " True\n", - " \n", - " \n", - " 5\n", - " 3.04\n", - " 2.79\n", 
- " 2.81\n", - " 2.92\n", - " 6.37\n", - " 7.08\n", - " 328.67\n", - " 12\n", - " 1\n", - " PASSIVE\n", - " None\n", - " True\n", - " \n", - " \n", - " 6\n", - " 3.12\n", - " 2.79\n", - " 2.82\n", - " 2.96\n", - " 6.66\n", + " 6.81\n", + " 6.80\n", + " 6.89\n", + " 6.91\n", + " 6.97\n", " 7.20\n", - " 320.00\n", + " 146.75\n", " 1\n", - " 12\n", - " PASSIVE\n", - " None\n", - " True\n", " \n", " \n", "\n", @@ -1221,31 +1075,16 @@ ], "text/plain": [ " Latency(ms) Latency_P50 Latency_P75 Latency_P90 Latency_P95 \\\n", - "0 3.01 2.79 2.81 2.86 5.08 \n", - "1 3.01 2.80 2.81 2.88 4.52 \n", - "2 3.01 2.78 2.80 2.92 5.01 \n", - "3 3.02 2.79 2.80 2.85 6.34 \n", - "4 3.04 2.80 2.82 2.93 5.56 \n", - "5 3.04 2.79 2.81 2.92 6.37 \n", - "6 3.12 2.79 2.82 2.96 6.66 \n", + "0 6.76 6.79 6.81 6.90 6.91 \n", + "1 6.78 6.70 6.79 6.87 6.90 \n", + "2 6.79 6.79 6.81 6.89 6.91 \n", + "3 6.81 6.80 6.89 6.91 6.97 \n", "\n", - " Latency_P99 Throughput(QPS) intra_op_num_threads OMP_NUM_THREADS \\\n", - "0 7.16 332.53 1 12 \n", - "1 7.05 331.90 0 \n", - "2 7.02 331.72 12 12 \n", - "3 7.04 331.39 12 1 \n", - "4 7.08 329.32 12 12 \n", - "5 7.08 328.67 12 1 \n", - "6 7.20 320.00 1 12 \n", - "\n", - " OMP_WAIT_POLICY contiguous warmup \n", - "0 ACTIVE None True \n", - "1 None True \n", - "2 ACTIVE None True \n", - "3 ACTIVE None True \n", - "4 PASSIVE None True \n", - "5 PASSIVE None True \n", - "6 PASSIVE None True " + " Latency_P99 Throughput(QPS) intra_op_num_threads \n", + "0 7.00 147.85 3 \n", + "1 7.63 147.54 4 \n", + "2 7.19 147.30 2 \n", + "3 7.20 146.75 1 " ] }, "execution_count": 22, @@ -1258,7 +1097,7 @@ "import glob \n", "import pandas\n", "latest_result_file = max(glob.glob(\"./onnx/perf_results_GPU_B1_S128_*.txt\"), key=os.path.getmtime)\n", - "result_data = pandas.read_table(latest_result_file, converters={'OMP_NUM_THREADS': str, 'OMP_WAIT_POLICY':str})\n", + "result_data = pandas.read_table(latest_result_file)\n", "print(\"Float32 model perf results from\", latest_result_file)\n", 
"# Remove some columns that have same values for all rows.\n", "columns_to_remove = ['model', 'graph_optimization_level', 'batch_size', 'sequence_length', 'test_cases', 'test_times', 'use_gpu']\n", @@ -1286,47 +1125,43 @@ "name": "stdout", "output_type": "stream", "text": [ - "test setting TestSetting(batch_size=32, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", + "test setting TestSetting(batch_size=32, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=3, seed=3, verbose=False)\n", "Generating 1000 samples for batch_size=32 sequence_length=128\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=32,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", - "Average latency = 16.17 ms, Throughput = 1979.41 QPS\n", - "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=32,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", + "Average latency = 168.40 ms, Throughput = 190.02 QPS\n", + "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=3, seed=3, verbose=False)\n", "Generating 1000 samples for batch_size=1 sequence_length=128\n", - "Running test: 
model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", - "Average latency = 3.00 ms, Throughput = 333.83 QPS\n", - "test setting TestSetting(batch_size=2, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", + "Average latency = 7.14 ms, Throughput = 140.00 QPS\n", + "test setting TestSetting(batch_size=2, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=3, seed=3, verbose=False)\n", "Generating 1000 samples for batch_size=2 sequence_length=128\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=2,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", - "Average latency = 3.59 ms, Throughput = 557.32 QPS\n", - "test setting TestSetting(batch_size=64, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", - "Generating 1000 samples for batch_size=64 sequence_length=128\n", - "Running test: 
model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=64,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", - "Average latency = 29.26 ms, Throughput = 2187.15 QPS\n", - "test setting TestSetting(batch_size=4, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=2,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", + "Average latency = 11.27 ms, Throughput = 177.41 QPS\n", + "test setting TestSetting(batch_size=4, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=3, seed=3, verbose=False)\n", "Generating 1000 samples for batch_size=4 sequence_length=128\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=4,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", - "Average latency = 4.32 ms, Throughput = 926.92 QPS\n", - "test setting TestSetting(batch_size=8, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=4,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", + "Average latency = 21.15 ms, Throughput = 189.09 QPS\n", + "test setting TestSetting(batch_size=8, sequence_length=128, 
test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=3, seed=3, verbose=False)\n", "Generating 1000 samples for batch_size=8 sequence_length=128\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=8,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", - "Average latency = 6.32 ms, Throughput = 1266.63 QPS\n", - "test setting TestSetting(batch_size=16, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=8,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", + "Average latency = 42.27 ms, Throughput = 189.27 QPS\n", + "test setting TestSetting(batch_size=16, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=3, seed=3, verbose=False)\n", "Generating 1000 samples for batch_size=16 sequence_length=128\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=16,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n", - "Average latency = 9.60 ms, Throughput = 1666.05 QPS\n", - "Test summary is saved to onnx/perf_results_GPU_B1-2-4-8-16-32-64_S128_20200617-232401.txt\n" + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=16,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", + "Average latency = 83.77 ms, Throughput = 191.01 QPS\n", + "Test summary is saved to 
onnx/perf_results_GPU_B1-2-4-8-16-32_S128_20210714-002816.txt\n" ] } ], "source": [ "GPU_OPTION = '--use_gpu' if use_gpu else ''\n", - "THREAD_SETTING = '--intra_op_num_threads 1 --omp_num_threads {} --omp_wait_policy ACTIVE'.format(psutil.cpu_count(logical=True))\n", - "!python -m onnxruntime_tools.transformers.bert_perf_test --model $optimized_fp16_model_path --batch_size 1 2 4 8 16 32 64 --sequence_length 128 --samples 1000 --test_times 1 --inclusive $THREAD_SETTING $GPU_OPTION" + "THREAD_SETTING = '--intra_op_num_threads 3'\n", + "!{sys.executable} -m onnxruntime.transformers.bert_perf_test --model $optimized_fp16_model_path --batch_size 1 2 4 8 16 32 --sequence_length 128 --samples 1000 --test_times 1 $THREAD_SETTING $GPU_OPTION" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 24, "metadata": { "scrolled": false }, @@ -1335,7 +1170,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Float16 model summary from ./onnx/perf_results_GPU_B1-2-4-8-16-32-64_S128_20200617-232401.txt\n" + "Float16 model summary from ./onnx/perf_results_GPU_B1-2-4-8-16-32_S128_20210714-002816.txt\n" ] }, { @@ -1372,106 +1207,93 @@ " \n", " \n", " 0\n", - " 3.00\n", - " 2.79\n", - " 2.81\n", - " 2.86\n", - " 4.37\n", - " 7.08\n", - " 333.83\n", + " 7.14\n", + " 7.10\n", + " 7.13\n", + " 7.25\n", + " 7.35\n", + " 10.99\n", + " 140.00\n", " 1\n", " \n", " \n", " 1\n", - " 3.59\n", - " 3.33\n", - " 3.35\n", - " 3.42\n", - " 6.60\n", - " 7.54\n", - " 557.32\n", + " 11.27\n", + " 11.23\n", + " 11.28\n", + " 11.53\n", + " 11.57\n", + " 12.05\n", + " 177.41\n", " 2\n", " \n", " \n", " 2\n", - " 4.32\n", - " 3.98\n", - " 4.01\n", - " 4.64\n", - " 7.23\n", - " 8.11\n", - " 926.92\n", + " 21.15\n", + " 21.13\n", + " 21.25\n", + " 21.44\n", + " 21.59\n", + " 22.07\n", + " 189.09\n", " 4\n", " \n", " \n", " 3\n", - " 6.32\n", - " 5.94\n", - " 5.97\n", - " 7.61\n", - " 8.96\n", - " 10.12\n", - " 1266.63\n", + " 42.27\n", + " 42.26\n", + " 42.68\n", + " 42.95\n", + " 
43.11\n", + " 45.11\n", + " 189.27\n", " 8\n", " \n", " \n", " 4\n", - " 9.60\n", - " 9.22\n", - " 9.25\n", - " 11.32\n", - " 12.33\n", - " 13.34\n", - " 1666.05\n", + " 83.77\n", + " 83.84\n", + " 84.29\n", + " 84.94\n", + " 85.35\n", + " 86.34\n", + " 191.01\n", " 16\n", " \n", " \n", " 5\n", - " 16.17\n", - " 15.80\n", - " 15.90\n", - " 17.38\n", - " 18.80\n", - " 19.93\n", - " 1979.41\n", + " 168.40\n", + " 169.62\n", + " 170.78\n", + " 171.94\n", + " 172.82\n", + " 174.28\n", + " 190.02\n", " 32\n", " \n", - " \n", - " 6\n", - " 29.26\n", - " 28.89\n", - " 29.01\n", - " 30.63\n", - " 32.53\n", - " 33.28\n", - " 2187.15\n", - " 64\n", - " \n", " \n", "\n", "" ], "text/plain": [ " Latency(ms) Latency_P50 Latency_P75 Latency_P90 Latency_P95 \\\n", - "0 3.00 2.79 2.81 2.86 4.37 \n", - "1 3.59 3.33 3.35 3.42 6.60 \n", - "2 4.32 3.98 4.01 4.64 7.23 \n", - "3 6.32 5.94 5.97 7.61 8.96 \n", - "4 9.60 9.22 9.25 11.32 12.33 \n", - "5 16.17 15.80 15.90 17.38 18.80 \n", - "6 29.26 28.89 29.01 30.63 32.53 \n", + "0 7.14 7.10 7.13 7.25 7.35 \n", + "1 11.27 11.23 11.28 11.53 11.57 \n", + "2 21.15 21.13 21.25 21.44 21.59 \n", + "3 42.27 42.26 42.68 42.95 43.11 \n", + "4 83.77 83.84 84.29 84.94 85.35 \n", + "5 168.40 169.62 170.78 171.94 172.82 \n", "\n", " Latency_P99 Throughput(QPS) batch_size \n", - "0 7.08 333.83 1 \n", - "1 7.54 557.32 2 \n", - "2 8.11 926.92 4 \n", - "3 10.12 1266.63 8 \n", - "4 13.34 1666.05 16 \n", - "5 19.93 1979.41 32 \n", - "6 33.28 2187.15 64 " + "0 10.99 140.00 1 \n", + "1 12.05 177.41 2 \n", + "2 22.07 189.09 4 \n", + "3 45.11 189.27 8 \n", + "4 86.34 191.01 16 \n", + "5 174.28 190.02 32 " ] }, - "execution_count": 26, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -1481,10 +1303,10 @@ "import glob \n", "import pandas\n", "latest_result_file = max(glob.glob(\"./onnx/perf_results_*.txt\"), key=os.path.getmtime)\n", - "result_data = pandas.read_table(latest_result_file, converters={'OMP_NUM_THREADS': str, 
'OMP_WAIT_POLICY':str})\n", + "result_data = pandas.read_table(latest_result_file)\n", "print(\"Float16 model summary from\", latest_result_file)\n", - "columns_to_remove = ['model', 'graph_optimization_level', 'test_cases', 'test_times', 'use_gpu', 'warmup', 'sequence_length']\n", - "columns_to_remove.extend(['intra_op_num_threads', 'OMP_NUM_THREADS', 'OMP_WAIT_POLICY', 'contiguous'])\n", + "columns_to_remove = ['model', 'graph_optimization_level', 'test_cases', 'test_times', 'use_gpu', 'sequence_length']\n", + "columns_to_remove.extend(['intra_op_num_threads'])\n", "result_data.drop(columns_to_remove, axis=1, inplace=True)\n", "result_data" ] @@ -1506,7 +1328,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 25, "metadata": { "scrolled": true }, @@ -1517,42 +1339,126 @@ "text": [ "{\r\n", " \"gpu\": {\r\n", - " \"driver_version\": \"440.64.00\",\r\n", + " \"driver_version\": \"450.51.05\",\r\n", " \"devices\": [\r\n", " {\r\n", - " \"memory_total\": 16945512448,\r\n", - " \"memory_available\": 14110883840,\r\n", - " \"name\": \"Tesla V100-PCIE-16GB\"\r\n", - " },\r\n", - " {\r\n", - " \"memory_total\": 16945512448,\r\n", - " \"memory_available\": 16932601856,\r\n", - " \"name\": \"Tesla V100-PCIE-16GB\"\r\n", + " \"memory_total\": 15843721216,\r\n", + " \"memory_available\": 9313189888,\r\n", + " \"name\": \"Tesla T4\"\r\n", " }\r\n", " ]\r\n", " },\r\n", " \"cpu\": {\r\n", - " \"brand\": \"Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz\",\r\n", - " \"cores\": 12,\r\n", - " \"logical_cores\": 12,\r\n", - " \"hz\": \"2.5940 GHz\",\r\n", - " \"l2_cache\": \"256 KB\",\r\n", - " \"l3_cache\": \"35840 KB\",\r\n", + " \"brand\": \"AMD EPYC 7V12 64-Core Processor\",\r\n", + " \"cores\": 4,\r\n", + " \"logical_cores\": 4,\r\n", + " \"hz\": [\r\n", + " 2445417000,\r\n", + " 0\r\n", + " ],\r\n", + " \"l2_cache\": 524288,\r\n", + " \"flags\": [\r\n", + " \"3dnowext\",\r\n", + " \"3dnowprefetch\",\r\n", + " \"abm\",\r\n", + " \"adx\",\r\n", + " 
\"aes\",\r\n", + " \"apic\",\r\n", + " \"arat\",\r\n", + " \"avx\",\r\n", + " \"avx2\",\r\n", + " \"bmi1\",\r\n", + " \"bmi2\",\r\n", + " \"clflush\",\r\n", + " \"clflushopt\",\r\n", + " \"clwb\",\r\n", + " \"cmov\",\r\n", + " \"cmp_legacy\",\r\n", + " \"cpuid\",\r\n", + " \"cr8_legacy\",\r\n", + " \"cx16\",\r\n", + " \"cx8\",\r\n", + " \"de\",\r\n", + " \"extd_apicid\",\r\n", + " \"f16c\",\r\n", + " \"fma\",\r\n", + " \"fpu\",\r\n", + " \"fsgsbase\",\r\n", + " \"fxsr\",\r\n", + " \"fxsr_opt\",\r\n", + " \"ht\",\r\n", + " \"hypervisor\",\r\n", + " \"lahf_lm\",\r\n", + " \"lm\",\r\n", + " \"mca\",\r\n", + " \"mce\",\r\n", + " \"misalignsse\",\r\n", + " \"mmx\",\r\n", + " \"mmxext\",\r\n", + " \"movbe\",\r\n", + " \"msr\",\r\n", + " \"mtrr\",\r\n", + " \"nopl\",\r\n", + " \"nx\",\r\n", + " \"osvw\",\r\n", + " \"osxsave\",\r\n", + " \"pae\",\r\n", + " \"pat\",\r\n", + " \"pclmulqdq\",\r\n", + " \"pdpe1gb\",\r\n", + " \"pge\",\r\n", + " \"pni\",\r\n", + " \"popcnt\",\r\n", + " \"pse\",\r\n", + " \"pse36\",\r\n", + " \"rdpid\",\r\n", + " \"rdrand\",\r\n", + " \"rdrnd\",\r\n", + " \"rdseed\",\r\n", + " \"rdtscp\",\r\n", + " \"rep_good\",\r\n", + " \"sep\",\r\n", + " \"sha\",\r\n", + " \"sha_ni\",\r\n", + " \"smap\",\r\n", + " \"smep\",\r\n", + " \"ssbd\",\r\n", + " \"sse\",\r\n", + " \"sse2\",\r\n", + " \"sse4_1\",\r\n", + " \"sse4_2\",\r\n", + " \"sse4a\",\r\n", + " \"ssse3\",\r\n", + " \"syscall\",\r\n", + " \"topoext\",\r\n", + " \"tsc\",\r\n", + " \"umip\",\r\n", + " \"vme\",\r\n", + " \"vmmcall\",\r\n", + " \"xgetbv1\",\r\n", + " \"xsave\",\r\n", + " \"xsavec\",\r\n", + " \"xsaveerptr\",\r\n", + " \"xsaveopt\",\r\n", + " \"xsaves\"\r\n", + " ],\r\n", " \"processor\": \"x86_64\"\r\n", " },\r\n", " \"memory\": {\r\n", - " \"total\": 236645588992,\r\n", - " \"available\": 222567559168\r\n", + " \"total\": 29450223616,\r\n", + " \"available\": 22402334720\r\n", " },\r\n", - " \"python\": \"3.7.7.final.0 (64 bit)\",\r\n", - " \"os\": 
\"Linux-4.15.0-1089-azure-x86_64-with-debian-stretch-sid\",\r\n", + " \"python\": \"3.6.13.final.0 (64 bit)\",\r\n", + " \"os\": \"Linux-5.4.0-1046-azure-x86_64-with-debian-buster-sid\",\r\n", " \"onnxruntime\": {\r\n", - " \"version\": \"1.3.0\",\r\n", + " \"version\": \"1.8.1\",\r\n", " \"support_gpu\": true\r\n", " },\r\n", + " \"onnxruntime_tools\": null,\r\n", " \"pytorch\": {\r\n", - " \"version\": \"1.5.0\",\r\n", - " \"support_gpu\": true\r\n", + " \"version\": \"1.9.0+cu111\",\r\n", + " \"support_gpu\": true,\r\n", + " \"cuda\": \"11.1\"\r\n", " },\r\n", " \"tensorflow\": null\r\n", "}\r\n" @@ -1560,15 +1466,15 @@ } ], "source": [ - "!{sys.executable} -m onnxruntime_tools.transformers.machine_info --silent" + "!{sys.executable} -m onnxruntime.transformers.machine_info --silent" ] } ], "metadata": { "kernelspec": { - "display_name": "gpu_env_py37", + "display_name": "gpu_env", "language": "python", - "name": "gpu_env_py37" + "name": "gpu_env" }, "language_info": { "codemirror_mode": { @@ -1580,7 +1486,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.7" + "version": "3.6.13" } }, "nbformat": 4,