diff --git a/onnxruntime/python/tools/bert/bert_model_optimization.py b/onnxruntime/python/tools/bert/bert_model_optimization.py index 4800d42f4b..bde7c765ec 100644 --- a/onnxruntime/python/tools/bert/bert_model_optimization.py +++ b/onnxruntime/python/tools/bert/bert_model_optimization.py @@ -30,7 +30,6 @@ import argparse import numpy as np from collections import deque from onnx import ModelProto, TensorProto, numpy_helper -import onnxruntime from BertOnnxModel import BertOnnxModel from BertOnnxModelTF import BertOnnxModelTF from BertOnnxModelKeras import BertOnnxModelKeras @@ -56,6 +55,8 @@ def optimize_by_onnxruntime(onnx_model_path, use_gpu, optimized_model_path=None) Returns: optimized_model_path: the path of optimized model """ + import onnxruntime + if use_gpu and 'CUDAExecutionProvider' not in onnxruntime.get_available_providers(): logger.error("There is no gpu for onnxruntime to do optimization.") return onnx_model_path diff --git a/onnxruntime/python/tools/bert/notebooks/Tensorflow_Keras_Bert-Squad_OnnxRuntime_CPU.ipynb b/onnxruntime/python/tools/bert/notebooks/Tensorflow_Keras_Bert-Squad_OnnxRuntime_CPU.ipynb new file mode 100644 index 0000000000..d1b0773cf7 --- /dev/null +++ b/onnxruntime/python/tools/bert/notebooks/Tensorflow_Keras_Bert-Squad_OnnxRuntime_CPU.ipynb @@ -0,0 +1,1014 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copyright (c) Microsoft Corporation. All rights reserved. \n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Inference TensorFlow Bert Model with ONNX Runtime on CPU" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this tutorial, you'll be introduced to how to load a Bert model using TensorFlow, convert it to ONNX using Keras2onnx, and inference it for high performance using ONNX Runtime. 
In the following sections, we are going to use the Bert model trained with Stanford Question Answering Dataset (SQuAD) dataset as an example. Bert SQuAD model is used in question answering scenarios, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 0. Prerequisites ##\n", + "First we need a python environment before running this notebook.\n", + "\n", + "You can install [AnaConda](https://www.anaconda.com/distribution/) and [Git](https://git-scm.com/downloads) and open an AnaConda console when it is done. Then you can run the following commands to create a conda environment named cpu_env:\n", + "\n", + "```console\n", + "conda create -n cpu_env python=3.6\n", + "conda activate cpu_env\n", + "\n", + "conda install -c anaconda ipykernel\n", + "conda install -c conda-forge ipywidgets\n", + "python -m ipykernel install --user --name=cpu_env\n", + "```\n", + "\n", + "Finally, launch Jupyter Notebook and you can choose cpu_env as kernel to run this notebook.\n", + "\n", + "Let's install [Tensorflow](https://www.tensorflow.org/install), [OnnxRuntime](https://microsoft.github.io/onnxruntime/), Keras2Onnx and other packages like the following:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "!{sys.executable} -m pip install --quiet --upgrade tensorflow==2.1.0\n", + "!{sys.executable} -m pip install --quiet --upgrade onnxruntime\n", + "\n", + "# Install keras2onnx from source, since the latest package (1.6.0) does not support bert models from tensorflow 2.1 currently.\n", + "!{sys.executable} -m pip install --quiet git+https://github.com/microsoft/onnxconverter-common\n", + "!{sys.executable} -m pip install --quiet git+https://github.com/onnx/keras-onnx\n", + " \n", + "# Install other packages 
used in this notebook. \n", + "!{sys.executable} -m pip install --quiet transformers==2.5.1\n", + "!{sys.executable} -m pip install --quiet wget psutil onnx pytz pandas py-cpuinfo py3nvml" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Whether allow overwrite existing script or model.\n", + "enable_overwrite = True\n", + "\n", + "# Number of runs to get average latency.\n", + "total_runs = 100" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "100% [..............................................................................] 15310 / 15310Downloaded bert_perf_test.py\n", + "100% [................................................................................] 9571 / 9571Downloaded bert_test_data.py\n", + "100% [................................................................................] 7272 / 7272Downloaded compare_bert_results.py\n", + "100% [..............................................................................] 44905 / 44905Downloaded BertOnnxModel.py\n", + "100% [..............................................................................] 21565 / 21565Downloaded BertOnnxModelKeras.py\n", + "100% [..............................................................................] 26114 / 26114Downloaded BertOnnxModelTF.py\n", + "100% [..............................................................................] 22773 / 22773Downloaded OnnxModel.py\n", + "100% [................................................................................] 
7795 / 7795Downloaded bert_model_optimization.py\n" + ] + } + ], + "source": [ + "import os\n", + "import wget\n", + "\n", + "cache_dir = \"./squad\"\n", + "output_dir = \"./output\"\n", + "script_dir = './bert_scripts'\n", + "\n", + "for directory in [cache_dir, output_dir, script_dir]:\n", + " if not os.path.exists(directory):\n", + " os.makedirs(directory)\n", + "\n", + "# Download scripts for BERT optimization.\n", + "url_prfix = \"https://raw.githubusercontent.com/microsoft/onnxruntime/master/onnxruntime/python/tools/bert/\"\n", + "script_files = ['bert_perf_test.py', 'bert_test_data.py', 'compare_bert_results.py', 'BertOnnxModel.py', 'BertOnnxModelKeras.py', 'BertOnnxModelTF.py', 'OnnxModel.py', 'bert_model_optimization.py']\n", + "\n", + "for filename in script_files:\n", + " target_file = os.path.join(script_dir, filename)\n", + " if enable_overwrite and os.path.exists(target_file):\n", + " os.remove(target_file)\n", + " if not os.path.exists(target_file):\n", + " wget.download(url_prfix + filename, target_file)\n", + " print(\"Downloaded\", filename)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Load Pretrained Bert model ##" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Start by loading the fine-tuned model. This step takes a few minutes to download the model (1.3 GB) for the first time."
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "from transformers import (TFBertForQuestionAnswering, BertTokenizer)\n", + "\n", + "model_name_or_path = 'bert-large-uncased-whole-word-masking-finetuned-squad'\n", + "\n", + "# Load model and tokenizer\n", + "tokenizer = BertTokenizer.from_pretrained(model_name_or_path, do_lower_case=True, cache_dir=cache_dir)\n", + "model = TFBertForQuestionAnswering.from_pretrained(model_name_or_path, cache_dir=cache_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. TensorFlow Inference\n", + "\n", + "Use one example to run inference using TensorFlow as baseline." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The answer is: a performance - focused inference engine for on ##nx models\n" + ] + } + ], + "source": [ + "import numpy\n", + "question, text = \"What is ONNX Runtime?\", \"ONNX Runtime is a performance-focused inference engine for ONNX models.\"\n", + "inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors='tf')\n", + "\n", + "start_scores, end_scores = model(inputs)\n", + "\n", + "num_tokens = len(inputs[\"input_ids\"][0])\n", + "all_tokens = tokenizer.convert_ids_to_tokens(inputs[\"input_ids\"][0])\n", + "print(\"The answer is:\", ' '.join(all_tokens[numpy.argmax(start_scores) : numpy.argmax(end_scores)+1]))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tensorflow Inference time for sequence length 26 = 227.06 ms\n" + ] + } + ], + "source": [ + "import time\n", + "start = time.time()\n", + "for _ in range(total_runs):\n", + " start_scores, end_scores = model(inputs)\n", + "end = time.time()\n", + "print(\"Tensorflow Inference time for sequence 
length {} = {} ms\".format(num_tokens, format((end - start) * 1000 / total_runs, '.2f')))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Export model to ONNX using Keras2onnx\n", + "\n", + "Now we use Keras2onnx to export the model to ONNX format. It takes about 18 minutes for the large model." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "The node number after optimization: 5257 -> 3836\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Keras2onnx run time = 1052.26 s\n" + ] + } + ], + "source": [ + "import keras2onnx\n", + "\n", + "output_model_path = os.path.join(output_dir, 'keras_{}.onnx'.format(model_name_or_path))\n", + "\n", + "if enable_overwrite or not os.path.exists(output_model_path):\n", + " model.predict(inputs)\n", + " start = time.time()\n", + " onnx_model = keras2onnx.convert_keras(model, model.name)\n", + " keras2onnx.save_model(onnx_model, output_model_path)\n", + " print(\"Keras2onnx run time = {} s\".format(format(time.time() - start, '.2f')))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Inference the Exported Model with ONNX Runtime" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### OpenMP Environment Variable\n", + "\n", + "OpenMP environment variable is important for CPU inference of Bert models. After running this notebook, you can find the best setting from [Performance Test Tool](#Performance-Test-Tool) result for your machine.\n", + "\n", + "Setting environment variables shall be done before importing onnxruntime. Otherwise, they might not take effect." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import psutil\n", + "\n", + "# You may change the settings in this cell according to Performance Test Tool result after running the whole notebook.\n", + "use_openmp = True\n", + "\n", + "# ATTENTION: these environment variables must be set before importing onnxruntime.\n", + "if use_openmp:\n", + " os.environ[\"OMP_NUM_THREADS\"] = str(psutil.cpu_count(logical=True))\n", + "else:\n", + " os.environ[\"OMP_NUM_THREADS\"] = '1'\n", + "\n", + "os.environ[\"OMP_WAIT_POLICY\"] = 'ACTIVE'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we are ready to inference the model with ONNX Runtime. Here we can see that OnnxRuntime has better performance than TensorFlow for this example even without optimization." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ONNX Runtime cpu inference time for sequence length 26 (model not optimized): 170.70 ms\n" + ] + } + ], + "source": [ + "import psutil\n", + "import onnxruntime\n", + "import numpy\n", + "\n", + "# User might use onnxruntime-gpu for CPU inference.\n", + "if use_openmp and 'CUDAExecutionProvider' in onnxruntime.get_available_providers():\n", + " print(\"warning: onnxruntime-gpu is not built with OpenMP. You might try onnxruntime package.\")\n", + " \n", + "sess_options = onnxruntime.SessionOptions()\n", + "\n", + "# The following settings enables OpenMP, which is required to get best performance for CPU inference of Bert models.\n", + "if use_openmp:\n", + " sess_options.intra_op_num_threads=1\n", + "else:\n", + " sess_options.intra_op_num_threads=psutil.cpu_count(logical=True)\n", + "\n", + "# Providers is optional. 
Only needed when you use onnxruntime-gpu for CPU inference.\n", + "session = onnxruntime.InferenceSession(output_model_path, sess_options, providers=['CPUExecutionProvider'])\n", + "\n", + "# Use contiguous array as input could improve performance.\n", + "inputs_onnx = {k_: numpy.ascontiguousarray(v_.numpy()) for k_, v_ in inputs.items()}\n", + "\n", + "# Warm up with one run.\n", + "results = session.run(None, inputs_onnx)\n", + "\n", + "# Measure the latency.\n", + "start = time.time()\n", + "for _ in range(total_runs):\n", + " results = session.run(None, inputs_onnx)\n", + "end = time.time()\n", + "print(\"ONNX Runtime cpu inference time for sequence length {} (model not optimized): {} ms\".format(num_tokens, format((end - start) * 1000 / total_runs, '.2f')))\n", + "del session" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "***** Verifying correctness (TensorFlow and ONNX Runtime) *****\n", + "WARNING:tensorflow:From :2: _EagerTensorBase.cpu (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Use tf.identity instead.\n", + "start_scores are close: True\n", + "end_scores are close: True\n" + ] + } + ], + "source": [ + "print(\"***** Verifying correctness (TensorFlow and ONNX Runtime) *****\")\n", + "print('start_scores are close:', numpy.allclose(results[0], start_scores.cpu(), rtol=1e-05, atol=1e-04))\n", + "print('end_scores are close:', numpy.allclose(results[1], end_scores.cpu(), rtol=1e-05, atol=1e-04))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Model Optimization\n", + "\n", + "[ONNX Runtime BERT Model Optimization Tools](https://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/bert) is a set of tools for optimizing and testing BERT models. Let's try some of them on the exported models." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### BERT Optimization Script\n", + "\n", + "The script **bert_model_optimization.py** can help optimize a BERT model exported by PyTorch, tf2onnx or keras2onnx. Since our model is exported by keras2onnx, we shall use the **--model_type bert_keras** parameter.\n", + "\n", + "It will also tell whether the model is fully optimized or not. If not, that means you might need to change the script to fuse some new pattern of subgraph." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " BertOnnxModelTF.py: Fused LayerNormalization count: 49\n", + "BertOnnxModelKeras.py: Fused Gelu count:24\n", + "BertOnnxModelKeras.py: start processing embedding layer...\n", + "BertOnnxModelKeras.py: Found word embedding. name:tf_bert_for_question_answering/bert/embeddings/Gather/resource:0, shape:(30522, 1024)\n", + "BertOnnxModelKeras.py: Found word embedding. name:tf_bert_for_question_answering/bert/embeddings/position_embeddings/embedding_lookup/413066:0, shape:(512, 1024)\n", + "BertOnnxModelKeras.py: Found segment embedding. 
name:tf_bert_for_question_answering/bert/embeddings/token_type_embeddings/embedding_lookup/413071:0, shape:(2, 1024)\n", + "BertOnnxModelKeras.py: Create Embedding node\n", + " OnnxModel.py: Graph pruned: 0 inputs, 0 outputs and 9 nodes are removed\n", + "BertOnnxModelKeras.py: Fused mask\n", + "BertOnnxModelKeras.py: Skip consequent Reshape count: 24\n", + " BertOnnxModel.py: Fused Reshape count:0\n", + " BertOnnxModel.py: Fused SkipLayerNormalization count: 48\n", + "BertOnnxModelKeras.py: Fused Attention count:24\n", + " BertOnnxModel.py: Fused SkipLayerNormalization with Bias count:24\n", + "BertOnnxModelKeras.py: Remove 96 Reshape nodes.\n", + " OnnxModel.py: Graph pruned: 0 inputs, 0 outputs and 2160 nodes are removed\n", + " BertOnnxModel.py: opset verion: 11\n", + " OnnxModel.py: Output model to ./output\\keras_bert_large_opt_cpu.onnx\n", + " BertOnnxModel.py: EmbedLayer=1, Attention=24, Gelu=24, LayerNormalization=48, Succesful=True\n", + "bert_model_optimization.py: The output model is fully optimized.\n" + ] + } + ], + "source": [ + "optimized_model_path = os.path.join(output_dir, 'keras_bert_large_opt_cpu.onnx')\n", + "\n", + "%run bert_scripts/bert_model_optimization.py --input $output_model_path --output $optimized_model_path --model_type bert_keras --num_heads 16 --hidden_size 1024" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We run the optimized model using same inputs. The inference latency is reduced after optimization. The output result is the same as the one before optimization." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ONNX Runtime cpu inference time on optimized model: 133.35 ms\n" + ] + } + ], + "source": [ + "session = onnxruntime.InferenceSession(optimized_model_path, sess_options)\n", + "# use one run to warm up a session\n", + "session.run(None, inputs_onnx)\n", + "\n", + "# measure the latency.\n", + "start = time.time()\n", + "for _ in range(total_runs):\n", + " opt_results = session.run(None, inputs_onnx)\n", + "end = time.time()\n", + "print(\"ONNX Runtime cpu inference time on optimized model: {} ms\".format(format((end - start) * 1000 / total_runs, '.2f')))\n", + "del session" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "***** Verifying correctness (before and after optimization) *****\n", + "start_scores are close: True\n", + "end_scores are close: True\n" + ] + } + ], + "source": [ + "print(\"***** Verifying correctness (before and after optimization) *****\")\n", + "print('start_scores are close:', numpy.allclose(opt_results[0], start_scores.cpu(), rtol=1e-05, atol=1e-04))\n", + "print('end_scores are close:', numpy.allclose(opt_results[1], end_scores.cpu(), rtol=1e-05, atol=1e-04))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Model Results Comparison Tool\n", + "\n", + "If your BERT model has three inputs, a script compare_bert_results.py can be used to do a quick verification. The tool will generate some fake input data, and compare results from both the original and optimized models. 
If outputs are all close, it is safe to use the optimized model.\n", + "\n", + "Example of comparing the models before and after optimization:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "100% passed for 10 random inputs given thresholds (rtol=0.001, atol=0.0001).\n", + "maximum absolute difference=2.3484230041503906e-05\n", + "maximum relative difference=0.00013404049968812615\n" + ] + } + ], + "source": [ + "# The base model is exported using sequence length 26\n", + "%run ./bert_scripts/compare_bert_results.py --baseline_model $output_model_path --optimized_model $optimized_model_path --batch_size 1 --sequence_length 26 --samples 10" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Performance Test Tool\n", + "\n", + "This tool measures performance of BERT model inference using OnnxRuntime Python API.\n", + "\n", + "The following command will create 100 samples of batch_size 1 and sequence length 128 to run inference, then calculate performance numbers like average latency and throughput etc. \n", + "\n", + "It takes about 20 minutes to run this test. You can remove --all to reduce number of settings in the test." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generating 100 samples for batch_size=1 sequence_length=128\n", + "Extra latency for converting inputs to contiguous: 0.04 ms\n", + "Test summary is saved to output\\perf_results_CPU_B1_S128_20200319-141051.txt\n" + ] + } + ], + "source": [ + "%run ./bert_scripts/bert_perf_test.py --model $optimized_model_path --batch_size 1 --sequence_length 128 --samples 100 --test_times 1 --inclusive --all" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's load the summary file and take a look. 
On this machine, the best result is achieved by OpenMP. The best setting might be different when using different hardware or a different model." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "./output\\perf_results_CPU_B1_S128_20200319-141051.txt\n", + "The best setting is: use openmp; NO contiguous array\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
intra_op_num_threadsOMP_NUM_THREADSOMP_WAIT_POLICYcontiguousLatency(ms)Latency_P99Throughput(QPS)
0112PASSIVEFalse254.20277.383.93
1112PASSIVETrue255.47283.563.91
2112ACTIVEFalse274.95334.493.64
3121ACTIVETrue278.91294.303.59
4112ACTIVETrue280.97351.413.56
5121ACTIVEFalse281.89296.113.55
6121PASSIVETrue282.75313.103.54
71False284.61356.983.51
81True292.11361.003.42
916PASSIVEFalse292.28346.473.42
10121PASSIVEFalse292.66346.733.42
1116PASSIVETrue310.60443.163.22
1216ACTIVETrue338.61402.032.95
1361PASSIVETrue362.84378.082.76
1461ACTIVETrue362.96372.772.76
1561ACTIVEFalse363.42392.192.75
1661PASSIVEFalse363.76385.012.75
1716ACTIVEFalse367.37423.172.72
1866ACTIVEFalse372.56484.902.68
1966PASSIVETrue383.58408.132.61
2066PASSIVEFalse384.07393.322.60
2166ACTIVETrue388.30647.232.58
220True423.20465.312.36
230False448.80550.752.23
\n", + "
" + ], + "text/plain": [ + " intra_op_num_threads OMP_NUM_THREADS OMP_WAIT_POLICY contiguous \\\n", + "0 1 12 PASSIVE False \n", + "1 1 12 PASSIVE True \n", + "2 1 12 ACTIVE False \n", + "3 12 1 ACTIVE True \n", + "4 1 12 ACTIVE True \n", + "5 12 1 ACTIVE False \n", + "6 12 1 PASSIVE True \n", + "7 1 False \n", + "8 1 True \n", + "9 1 6 PASSIVE False \n", + "10 12 1 PASSIVE False \n", + "11 1 6 PASSIVE True \n", + "12 1 6 ACTIVE True \n", + "13 6 1 PASSIVE True \n", + "14 6 1 ACTIVE True \n", + "15 6 1 ACTIVE False \n", + "16 6 1 PASSIVE False \n", + "17 1 6 ACTIVE False \n", + "18 6 6 ACTIVE False \n", + "19 6 6 PASSIVE True \n", + "20 6 6 PASSIVE False \n", + "21 6 6 ACTIVE True \n", + "22 0 True \n", + "23 0 False \n", + "\n", + " Latency(ms) Latency_P99 Throughput(QPS) \n", + "0 254.20 277.38 3.93 \n", + "1 255.47 283.56 3.91 \n", + "2 274.95 334.49 3.64 \n", + "3 278.91 294.30 3.59 \n", + "4 280.97 351.41 3.56 \n", + "5 281.89 296.11 3.55 \n", + "6 282.75 313.10 3.54 \n", + "7 284.61 356.98 3.51 \n", + "8 292.11 361.00 3.42 \n", + "9 292.28 346.47 3.42 \n", + "10 292.66 346.73 3.42 \n", + "11 310.60 443.16 3.22 \n", + "12 338.61 402.03 2.95 \n", + "13 362.84 378.08 2.76 \n", + "14 362.96 372.77 2.76 \n", + "15 363.42 392.19 2.75 \n", + "16 363.76 385.01 2.75 \n", + "17 367.37 423.17 2.72 \n", + "18 372.56 484.90 2.68 \n", + "19 383.58 408.13 2.61 \n", + "20 384.07 393.32 2.60 \n", + "21 388.30 647.23 2.58 \n", + "22 423.20 465.31 2.36 \n", + "23 448.80 550.75 2.23 " + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import glob \n", + "import pandas\n", + "\n", + "latest_result_file = max(glob.glob(os.path.join(output_dir, \"perf_results_*.txt\")), key=os.path.getmtime)\n", + "result_data = pandas.read_table(latest_result_file, converters={'OMP_NUM_THREADS': str, 'OMP_WAIT_POLICY':str})\n", + "print(latest_result_file)\n", + "print(\"The best setting is: {} openmp; {} contiguous array\".format('use' 
if result_data['intra_op_num_threads'].iloc[0] == 1 else 'NO', 'use' if result_data['contiguous'].iloc[0] else 'NO'))\n", + "\n", + "result_data.drop(['model', 'graph_optimization_level', 'batch_size', 'sequence_length', 'test_cases', 'test_times', 'use_gpu', 'warmup'], axis=1, inplace=True)\n", + "result_data.drop(['Latency_P50', 'Latency_P75', 'Latency_P90', 'Latency_P95'], axis=1, inplace=True)\n", + "cols = result_data.columns.tolist()\n", + "cols = cols[-4:] + cols[:-4]\n", + "result_data = result_data[cols]\n", + "result_data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Additional Info\n", + "\n", + "Note that running Jupyter Notebook has slight impact on performance result since Jupyter Notebook is using system resources like CPU and memory etc. It is recommended to close Jupyter Notebook and other applications, then run the performance test tool in a console to get more accurate performance numbers.\n", + "\n", + "[OnnxRuntime C API](https://github.com/microsoft/onnxruntime/blob/master/docs/C_API.md) could get slightly better performance than python API. If you use C API in inference, you can use OnnxRuntime_Perf_Test.exe built from source to measure performance instead.\n", + "\n", + "Here is the machine configuration that generated the above results. The machine has GPU but not used in CPU inference.\n", + "You might get slower or faster result based on your hardware." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"gpu\": {\n", + " \"driver_version\": \"441.22\",\n", + " \"devices\": [\n", + " {\n", + " \"memory_total\": 8589934592,\n", + " \"memory_available\": 611880960,\n", + " \"name\": \"GeForce GTX 1070\"\n", + " }\n", + " ]\n", + " },\n", + " \"cpu\": {\n", + " \"brand\": \"Intel(R) Core(TM) i7-8700 CPU @ 3.20GHz\",\n", + " \"cores\": 6,\n", + " \"logical_cores\": 12,\n", + " \"hz\": \"3.1920 GHz\",\n", + " \"l2_cache\": \"1536 KB\",\n", + " \"l3_cache\": \"12288 KB\",\n", + " \"processor\": \"Intel64 Family 6 Model 158 Stepping 10, GenuineIntel\"\n", + " },\n", + " \"memory\": {\n", + " \"total\": 16971259904,\n", + " \"available\": 6245142528\n", + " },\n", + " \"python\": \"3.6.10.final.0 (64 bit)\",\n", + " \"os\": \"Windows-10-10.0.18362-SP0\",\n", + " \"onnxruntime\": {\n", + " \"version\": \"1.2.0\",\n", + " \"support_gpu\": false\n", + " },\n", + " \"pytorch\": {\n", + " \"version\": \"1.4.0+cpu\",\n", + " \"support_gpu\": false\n", + " },\n", + " \"tensorflow\": {\n", + " \"version\": \"2.1.0\",\n", + " \"git_version\": \"v2.1.0-rc2-17-ge5bf8de410\",\n", + " \"support_gpu\": true\n", + " }\n", + "}\n" + ] + } + ], + "source": [ + "%run ./bert_scripts/MachineInfo.py --silent" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "cpu_env", + "language": "python", + "name": "cpu_env" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}