From 3a1e48dd5ad2a9ef95b6bc137441a4df6b05e5d0 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Thu, 14 Sep 2023 18:15:29 -0700 Subject: [PATCH] update BERT notebook with ORT 1.16 (#17524) - Update BERT notebook with onnxruntime-gpu 1.16 - Add example of packing mode - Run results in RTX 4090 GPU --- .../PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb | 1665 +++++++++++------ 1 file changed, 1072 insertions(+), 593 deletions(-) diff --git a/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb b/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb index 74b81fc7c8..43c31e1ea4 100644 --- a/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb +++ b/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb @@ -33,19 +33,20 @@ "\n", "#### GPU Environment Setup using AnaConda\n", "\n", - "First, we install [AnaConda](https://www.anaconda.com/distribution/) in a target machine and open an AnaConda prompt window when it is done. Then run the following commands to create a conda environment. This notebook is tested with PyTorch 1.5.0 and OnnxRuntime 1.3.0.\n", + "First, we install [AnaConda](https://www.anaconda.com/distribution/) in a target machine and open an AnaConda prompt window when it is done. Then run the following commands to create a conda environment. 
This notebook is tested with PyTorch 2.0.1 and OnnxRuntime 1.16.0.\n", "\n", "```console\n", - "conda create -n gpu_env python=3.6\n", + "conda create -n gpu_env python=3.10\n", "conda activate gpu_env\n", - "conda install -c anaconda ipykernel\n", + "pip install jupyterlab\n", + "conda install ipykernel\n", "conda install -c conda-forge ipywidgets\n", - "python -m ipykernel install --user --name=gpu_env\n", - "jupyter notebook\n", + "ipython kernel install --user --name gpu_env\n", + "jupyter-lab\n", "```\n", "Finally, launch Jupyter Notebook and you can choose gpu_env as kernel to run this notebook.\n", "\n", - "Onnxruntime-gpu need specified version of CUDA and cuDNN. You can find the Requirements [here](https://onnxruntime.ai/docs/install/). Remember to add the directories to PATH environment variable (See [CUDA and cuDNN Path](#CUDA-and-cuDNN-Path) below)." + "Onnxruntime-gpu needs specific versions of CUDA and cuDNN. You can find the requirements [here](https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#requirements). Remember to add their directories to the PATH environment variable (see [CUDA and cuDNN Path](#CUDA-and-cuDNN-Path) below)." 
] }, { @@ -56,18 +57,19 @@ "source": [ "import sys\n", "\n", - "run_install = False # Only need install once\n", - "if run_install:\n", - " if sys.platform in ['linux', 'win32']: # Linux or Windows\n", - " !{sys.executable} -m pip install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio===0.9.0 -f https://download.pytorch.org/whl/torch_stable.html\n", - " else: # Mac\n", - " print(\"PyTorch 1.9 MacOS Binaries do not support CUDA, install from source instead\")\n", - "\n", - " !{sys.executable} -m pip install onnxruntime-gpu==1.8.1 onnx==1.9.0 onnxconverter_common==1.8.1\n", - "\n", - " # Install other packages used in this notebook.\n", - " !{sys.executable} -m pip install transformers==4.8.2\n", - " !{sys.executable} -m pip install psutil pytz pandas py-cpuinfo py3nvml coloredlogs wget netron sympy" + "if sys.platform in ['linux', 'win32']: # Linux or Windows\n", + " !{sys.executable} -m pip install torch --index-url https://download.pytorch.org/whl/cu118 -q\n", + " !{sys.executable} -m pip install onnxruntime-gpu onnx transformers psutil pandas py-cpuinfo py3nvml coloredlogs wget netron sympy protobuf==3.20.3 -q\n", + "else: # Mac\n", + " print(\"CUDA is not available on MacOS\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### CUDA and cuDNN Path\n", + "onnxruntime-gpu depends on [CUDA](https://developer.nvidia.com/cuda-downloads) and [cuDNN](https://developer.nvidia.com/cudnn). The required CUDA version can be found [here](https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#requirements). If you import torch before onnxruntime, onnxruntime might use the CUDA and cuDNN DLLs that were loaded by PyTorch." 
] }, { @@ -79,10 +81,10 @@ "name": "stdout", "output_type": "stream", "text": [ - "pytorch: 1.9.0+cu111\n", - "onnxruntime: 1.8.1\n", - "onnx: 1.9.0\n", - "transformers: 4.8.2\n" + "pytorch: 2.0.1+cu118\n", + "onnxruntime: 1.16.0\n", + "onnx: 1.14.1\n", + "transformers: 4.33.1\n" ] } ], @@ -191,9 +193,12 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 48/48 [00:03<00:00, 14.24it/s]\n", - "convert squad examples to features: 100%|██████████| 1000/1000 [00:08<00:00, 112.67it/s]\n", - "add example index and unique id: 100%|██████████| 1000/1000 [00:00<00:00, 836518.55it/s]\n" + "Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']\n", + "- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:02<00:00, 16.27it/s]\n", + "convert squad examples to features: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:03<00:00, 256.11it/s]\n", + "add example index and unique id: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Latency(ms)Latency_P50Latency_P75Latency_P90Latency_P95Latency_P99Throughput(QPS)intra_op_num_threads
03.193.163.213.273.353.52313.241
13.203.173.223.253.343.50312.808
23.203.153.253.293.363.58312.5115
33.203.183.213.263.353.53312.4914
43.203.163.253.293.403.56312.2413
53.203.193.223.273.353.48312.2012
63.213.183.233.283.373.51311.7324
73.213.193.233.273.343.52311.579
83.213.183.263.313.363.54311.1532
93.213.173.243.283.343.52311.105
103.213.193.253.293.333.54311.102
113.223.193.253.293.363.51310.9310
123.223.193.263.293.403.55310.303
133.233.193.263.323.423.58310.0211
143.233.193.263.303.363.54310.024
153.233.203.233.273.353.60309.537
163.233.193.223.263.333.68309.276
\n", + "" + ], + "text/plain": [ + " Latency(ms) Latency_P50 Latency_P75 Latency_P90 Latency_P95 \\\n", + "0 3.19 3.16 3.21 3.27 3.35 \n", + "1 3.20 3.17 3.22 3.25 3.34 \n", + "2 3.20 3.15 3.25 3.29 3.36 \n", + "3 3.20 3.18 3.21 3.26 3.35 \n", + "4 3.20 3.16 3.25 3.29 3.40 \n", + "5 3.20 3.19 3.22 3.27 3.35 \n", + "6 3.21 3.18 3.23 3.28 3.37 \n", + "7 3.21 3.19 3.23 3.27 3.34 \n", + "8 3.21 3.18 3.26 3.31 3.36 \n", + "9 3.21 3.17 3.24 3.28 3.34 \n", + "10 3.21 3.19 3.25 3.29 3.33 \n", + "11 3.22 3.19 3.25 3.29 3.36 \n", + "12 3.22 3.19 3.26 3.29 3.40 \n", + "13 3.23 3.19 3.26 3.32 3.42 \n", + "14 3.23 3.19 3.26 3.30 3.36 \n", + "15 3.23 3.20 3.23 3.27 3.35 \n", + "16 3.23 3.19 3.22 3.26 3.33 \n", + "\n", + " Latency_P99 Throughput(QPS) intra_op_num_threads \n", + "0 3.52 313.24 1 \n", + "1 3.50 312.80 8 \n", + "2 3.58 312.51 15 \n", + "3 3.53 312.49 14 \n", + "4 3.56 312.24 13 \n", + "5 3.48 312.20 12 \n", + "6 3.51 311.73 24 \n", + "7 3.52 311.57 9 \n", + "8 3.54 311.15 32 \n", + "9 3.52 311.10 5 \n", + "10 3.54 311.10 2 \n", + "11 3.51 310.93 10 \n", + "12 3.55 310.30 3 \n", + "13 3.58 310.02 11 \n", + "14 3.54 310.02 4 \n", + "15 3.60 309.53 7 \n", + "16 3.68 309.27 6 " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def load_last_perf_test_result():\n", + " import os\n", + " import glob \n", + " import pandas\n", + " latest_result_file = max(glob.glob(\"./onnx/perf_results_*.txt\"), key=os.path.getmtime)\n", + " result_data = pandas.read_table(latest_result_file)\n", + " print(\"Perf results from\", latest_result_file)\n", + " # Do not show columns that have same values for all rows.\n", + " columns_to_remove = ['model', 'graph_optimization_level', 'batch_size', 'sequence_length', 'test_cases', 'test_times', 'use_gpu', 'use_io_binding', 'average_sequence_length', 'random_sequence_length']\n", + " result_data.drop(columns_to_remove, axis=1, inplace=True)\n", + " return result_data\n", + " \n", + 
"thread_results = load_last_perf_test_result()\n", + "thread_results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "From the above results, we can see that latency is very close for different settings of intra_op_num_threads.\n", + "\n", + "### Model Results Comparison Tool\n", + "\n", + "When a BERT model is optimized, some approximation is used in the calculation. If your BERT model has three inputs, the script compare_bert_results.py can be used for a quick verification. The tool generates some fake input data and compares the inference outputs of the original and optimized models. If the outputs are all close, it is safe to use the optimized model.\n", + "\n", + "For GPU inference, the absolute or relative difference is larger than for CPU inference. Note that a slight difference in output will not impact the final result. We did an end-to-end evaluation on the SQuAD data set using a fine-tuned SQuAD model, and the F1 score is almost the same before and after optimization." ] }, { @@ -761,151 +1070,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "Float32 model perf results from ./onnx/perf_results_GPU_B1_S128_20210714-001817.txt\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Latency(ms)Latency_P50Latency_P75Latency_P90Latency_P95Latency_P99Throughput(QPS)intra_op_num_threads
023.7223.7223.8723.9924.1124.3742.154
124.2424.2424.4224.6024.7625.2341.253
224.3624.3624.4724.6925.0126.5241.052
324.3924.3724.4724.6524.7325.1241.011
\n", - "
" - ], - "text/plain": [ - " Latency(ms) Latency_P50 Latency_P75 Latency_P90 Latency_P95 \\\n", - "0 23.72 23.72 23.87 23.99 24.11 \n", - "1 24.24 24.24 24.42 24.60 24.76 \n", - "2 24.36 24.36 24.47 24.69 25.01 \n", - "3 24.39 24.37 24.47 24.65 24.73 \n", - "\n", - " Latency_P99 Throughput(QPS) intra_op_num_threads \n", - "0 24.37 42.15 4 \n", - "1 25.23 41.25 3 \n", - "2 26.52 41.05 2 \n", - "3 25.12 41.01 1 " - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import os\n", - "import glob \n", - "import pandas\n", - "latest_result_file = max(glob.glob(\"./onnx/perf_results_GPU_B1_S128_*.txt\"), key=os.path.getmtime)\n", - "result_data = pandas.read_table(latest_result_file)\n", - "print(\"Float32 model perf results from\", latest_result_file)\n", - "# Remove some columns that have same values for all rows.\n", - "columns_to_remove = ['model', 'graph_optimization_level', 'batch_size', 'sequence_length', 'test_cases', 'test_times', 'use_gpu']\n", - "result_data.drop(columns_to_remove, axis=1, inplace=True)\n", - "result_data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "From above result, we can see that latency is very close for different settings. The default setting (intra_op_num_threads=0, OMP_NUM_THREADS and OMP_WAIT_POLICY does not exist) performs the best. \n", - "\n", - "### Model Results Comparison Tool\n", - "\n", - "When a BERT model is optimized, some approximation is used in calculation. If your BERT model has three inputs, a script compare_bert_results.py can be used to do a quick verification. The tool will generate some fake input data, and compare the inference outputs of the original and optimized models. If outputs are all close, it is safe to use the optimized model.\n", - "\n", - "For GPU inference, the absolute or relative difference is larger than those numbers of CPU inference. Note that slight difference in output will not impact final result. 
We did end-to-end evaluation using SQuAD data set using a fine-tuned squad model, and F1 score is almost the same before/after optimization." - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "100% passed for 100 random inputs given thresholds (rtol=0.01, atol=0.01).\r\n", - "maximum absolute difference=5.316734313964844e-05\r\n", - "maximum relative difference=0.00012461667938623577\r\n" + "100% passed for 100 random inputs given thresholds (rtol=0.01, atol=0.01).\n", + "maximum absolute difference=0.05149984359741211\n" ] } ], "source": [ - "!{sys.executable} -m onnxruntime.transformers.compare_bert_results --baseline_model $export_model_path --optimized_model $optimized_fp32_model_path --batch_size 1 --sequence_length 128 --samples 100 --rtol 0.01 --atol 0.01 $GPU_OPTION" + "USE_GPU = '--use_gpu' if use_gpu else ''\n", + "!{sys.executable} -m onnxruntime.transformers.compare_bert_results --baseline_model $export_model_path --optimized_model $optimized_fp32_model_path --batch_size 1 --sequence_length 128 --samples 100 --rtol 0.01 --atol 0.01 $USE_GPU" ] }, { @@ -916,7 +1088,40 @@ "\n", "The optimizer.py script have an option **--float16** to convert model to use float16 to store weights. After the conversion, it could be faster to run in GPU with tensor cores like V100 or T4.\n", "\n", - "Let's run tools to measure the performance on V100. The results show significant performance improvement: latency is about 3.4 ms for float32 model, and 1.8 ms for float16 model." + "Let's run tools to measure the performance on Nvidia RTX 4090. The results show significant performance improvement: latency is about 3.2 ms for float32 model, and about 1.8 ms for float16 model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b\u0000[\u00000\u0000;\u00009\u00003\u0000m\u00002\u00000\u00002\u00003\u0000-\u00000\u00009\u0000-\u00001\u00002\u0000 \u00001\u00002\u0000:\u00005\u00007\u0000:\u00005\u00004\u0000.\u00005\u00005\u00000\u00008\u00002\u00002\u00008\u0000 \u0000[\u0000W\u0000:\u0000o\u0000n\u0000n\u0000x\u0000r\u0000u\u0000n\u0000t\u0000i\u0000m\u0000e\u0000:\u0000,\u0000 \u0000s\u0000e\u0000s\u0000s\u0000i\u0000o\u0000n\u0000_\u0000s\u0000t\u0000a\u0000t\u0000e\u0000.\u0000c\u0000c\u0000:\u00001\u00001\u00006\u00002\u0000 \u0000o\u0000n\u0000n\u0000x\u0000r\u0000u\u0000n\u0000t\u0000i\u0000m\u0000e\u0000:\u0000:\u0000V\u0000e\u0000r\u0000i\u0000f\u0000y\u0000E\u0000a\u0000c\u0000h\u0000N\u0000o\u0000d\u0000e\u0000I\u0000s\u0000A\u0000s\u0000s\u0000i\u0000g\u0000n\u0000e\u0000d\u0000T\u0000o\u0000A\u0000n\u0000E\u0000p\u0000]\u0000 \u0000S\u0000o\u0000m\u0000e\u0000 \u0000n\u0000o\u0000d\u0000e\u0000s\u0000 \u0000w\u0000e\u0000r\u0000e\u0000 \u0000n\u0000o\u0000t\u0000 \u0000a\u0000s\u0000s\u0000i\u0000g\u0000n\u0000e\u0000d\u0000 \u0000t\u0000o\u0000 \u0000t\u0000h\u0000e\u0000 \u0000p\u0000r\u0000e\u0000f\u0000e\u0000r\u0000r\u0000e\u0000d\u0000 \u0000e\u0000x\u0000e\u0000c\u0000u\u0000t\u0000i\u0000o\u0000n\u0000 \u0000p\u0000r\u0000o\u0000v\u0000i\u0000d\u0000e\u0000r\u0000s\u0000 \u0000w\u0000h\u0000i\u0000c\u0000h\u0000 \u0000m\u0000a\u0000y\u0000 \u0000o\u0000r\u0000 \u0000m\u0000a\u0000y\u0000 \u0000n\u0000o\u0000t\u0000 \u0000h\u0000a\u0000v\u0000e\u0000 \u0000a\u0000n\u0000 \u0000n\u0000e\u0000g\u0000a\u0000t\u0000i\u0000v\u0000e\u0000 \u0000i\u0000m\u0000p\u0000a\u0000c\u0000t\u0000 \u0000o\u0000n\u0000 \u0000p\u0000e\u0000r\u0000f\u0000o\u0000r\u0000m\u0000a\u0000n\u0000c\u0000e\u0000.\u0000 \u0000e\u0000.\u0000g\u0000.\u0000 \u0000O\u0000R\u0000T\u0000 
\u0000e\u0000x\u0000p\u0000l\u0000i\u0000c\u0000i\u0000t\u0000l\u0000y\u0000 \u0000a\u0000s\u0000s\u0000i\u0000g\u0000n\u0000s\u0000 \u0000s\u0000h\u0000a\u0000p\u0000e\u0000 \u0000r\u0000e\u0000l\u0000a\u0000t\u0000e\u0000d\u0000 \u0000o\u0000p\u0000s\u0000 \u0000t\u0000o\u0000 \u0000C\u0000P\u0000U\u0000 \u0000t\u0000o\u0000 \u0000i\u0000m\u0000p\u0000r\u0000o\u0000v\u0000e\u0000 \u0000p\u0000e\u0000r\u0000f\u0000.\u0000\u001b\u0000[\u0000m\u0000\n", + "\u0000\u001b\u0000[\u00000\u0000;\u00009\u00003\u0000m\u00002\u00000\u00002\u00003\u0000-\u00000\u00009\u0000-\u00001\u00002\u0000 \u00001\u00002\u0000:\u00005\u00007\u0000:\u00005\u00004\u0000.\u00005\u00005\u00001\u00001\u00000\u00000\u00008\u0000 \u0000[\u0000W\u0000:\u0000o\u0000n\u0000n\u0000x\u0000r\u0000u\u0000n\u0000t\u0000i\u0000m\u0000e\u0000:\u0000,\u0000 \u0000s\u0000e\u0000s\u0000s\u0000i\u0000o\u0000n\u0000_\u0000s\u0000t\u0000a\u0000t\u0000e\u0000.\u0000c\u0000c\u0000:\u00001\u00001\u00006\u00004\u0000 \u0000o\u0000n\u0000n\u0000x\u0000r\u0000u\u0000n\u0000t\u0000i\u0000m\u0000e\u0000:\u0000:\u0000V\u0000e\u0000r\u0000i\u0000f\u0000y\u0000E\u0000a\u0000c\u0000h\u0000N\u0000o\u0000d\u0000e\u0000I\u0000s\u0000A\u0000s\u0000s\u0000i\u0000g\u0000n\u0000e\u0000d\u0000T\u0000o\u0000A\u0000n\u0000E\u0000p\u0000]\u0000 \u0000R\u0000e\u0000r\u0000u\u0000n\u0000n\u0000i\u0000n\u0000g\u0000 \u0000w\u0000i\u0000t\u0000h\u0000 \u0000v\u0000e\u0000r\u0000b\u0000o\u0000s\u0000e\u0000 \u0000o\u0000u\u0000t\u0000p\u0000u\u0000t\u0000 \u0000o\u0000n\u0000 \u0000a\u0000 \u0000n\u0000o\u0000n\u0000-\u0000m\u0000i\u0000n\u0000i\u0000m\u0000a\u0000l\u0000 \u0000b\u0000u\u0000i\u0000l\u0000d\u0000 \u0000w\u0000i\u0000l\u0000l\u0000 \u0000s\u0000h\u0000o\u0000w\u0000 \u0000n\u0000o\u0000d\u0000e\u0000 \u0000a\u0000s\u0000s\u0000i\u0000g\u0000n\u0000m\u0000e\u0000n\u0000t\u0000s\u0000.\u0000\u001b\u0000[\u0000m\u0000\n", + "\u0000 apply: Fused LayerNormalization: 49\n", + " apply: Fused Gelu: 24\n", + " apply: Fused 
SkipLayerNormalization: 48\n", + " apply: Fused Attention: 24\n", + " prune_graph: Removed 5 nodes\n", + " apply: Fused EmbedLayerNormalization(with mask): 1\n", + " prune_graph: Removed 10 nodes\n", + " apply: Fused BiasGelu: 24\n", + " apply: Fused SkipLayerNormalization(add bias): 48\n", + " optimize: opset version: 11\n", + "get_fused_operator_statistics: Optimized operators:{'EmbedLayerNormalization': 1, 'Attention': 24, 'MultiHeadAttention': 0, 'Gelu': 0, 'FastGelu': 0, 'BiasGelu': 24, 'GemmFastGelu': 0, 'LayerNormalization': 0, 'SkipLayerNormalization': 48, 'QOrderedAttention': 0, 'QOrderedGelu': 0, 'QOrderedLayerNormalization': 0, 'QOrderedMatMul': 0}\n", + " main: The model has been fully optimized.\n", + " save_model_to_file: Sort graphs in topological order\n", + " save_model_to_file: Model saved to ./onnx/bert-base-cased-squad_opt_gpu_fp16.onnx\n" + ] + } + ], + "source": [ + "optimized_fp16_model_path = './onnx/bert-base-cased-squad_opt_{}_fp16.onnx'.format('gpu' if use_gpu else 'cpu')\n", + "!{sys.executable} -m onnxruntime.transformers.optimizer --input $export_model_path --output $optimized_fp16_model_path --float16 $USE_GPU" ] }, { @@ -928,28 +1133,49 @@ "name": "stdout", "output_type": "stream", "text": [ - " apply: Fused LayerNormalization count: 49\n", - " apply: Fused Gelu count: 24\n", - "adjust_reshape_and_expand: Removed Reshape and Expand count: 0\n", - " apply: Fused SkipLayerNormalization count: 48\n", - " apply: Fused Attention count: 24\n", - " prune_graph: Graph pruned: 0 inputs, 0 outputs and 5 nodes are removed\n", - " apply: Fused EmbedLayerNormalization(with mask) count: 1\n", - " prune_graph: Graph pruned: 0 inputs, 0 outputs and 3 nodes are removed\n", - " prune_graph: Graph pruned: 0 inputs, 0 outputs and 0 nodes are removed\n", - " apply: Fused BiasGelu count: 24\n", - " apply: Fused SkipLayerNormalization(add bias) count: 48\n", - " optimize: opset version: 11\n", - " save_model_to_file: Sort graphs in topological order\n", - 
" save_model_to_file: Output model to ./onnx/bert-base-cased-squad_opt_gpu_fp16.onnx\n", - "get_fused_operator_statistics: Optimized operators:{'EmbedLayerNormalization': 1, 'Attention': 24, 'Gelu': 0, 'FastGelu': 0, 'BiasGelu': 24, 'LayerNormalization': 0, 'SkipLayerNormalization': 48}\n", - " main: The model has been fully optimized.\n" + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=32,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.77 ms, Throughput = 566.45 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=24,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.74 ms, Throughput = 574.96 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=15,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.74 ms, Throughput = 574.28 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=14,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.74 ms, Throughput = 575.17 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=13,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.76 ms, 
Throughput = 569.77 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.79 ms, Throughput = 559.84 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=11,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.77 ms, Throughput = 566.09 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=10,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.77 ms, Throughput = 563.97 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=9,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.77 ms, Throughput = 565.70 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.77 ms, Throughput = 565.50 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=7,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average 
latency = 1.77 ms, Throughput = 566.38 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=6,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.75 ms, Throughput = 572.89 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=5,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.76 ms, Throughput = 568.67 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=4,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.78 ms, Throughput = 561.98 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.77 ms, Throughput = 566.14 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=2,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.78 ms, Throughput = 563.25 QPS\n", + "Running test: 
model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.77 ms, Throughput = 565.09 QPS\n", + "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, use_io_binding=True, provider=None, intra_op_num_threads=None, seed=3, verbose=False, log_severity=2, average_sequence_length=128, random_sequence_length=False)\n", + "Generating 1000 samples for batch_size=1 sequence_length=128\n", + "Test summary is saved to onnx\\perf_results_GPU_B1_S128_20230912-130021.txt\n" ] } ], "source": [ - "optimized_fp16_model_path = './onnx/bert-base-cased-squad_opt_{}_fp16.onnx'.format('gpu' if use_gpu else 'cpu')\n", - "!{sys.executable} -m onnxruntime.transformers.optimizer --input $export_model_path --output $optimized_fp16_model_path --float16" + "GPU_OPTION = '--use_gpu --use_io_binding' if use_gpu else ''\n", + "!python -m onnxruntime.transformers.bert_perf_test --model $optimized_fp16_model_path --batch_size 1 --sequence_length 128 --samples 1000 --test_times 1 $GPU_OPTION" ] }, { @@ -961,35 +1187,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=None, seed=3, verbose=False)\n", - "Generating 1000 samples for batch_size=1 sequence_length=128\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=4,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", - "Average latency = 6.78 ms, Throughput = 147.54 QPS\n", - "Running test: 
model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", - "Average latency = 6.76 ms, Throughput = 147.85 QPS\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=2,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", - "Average latency = 6.79 ms, Throughput = 147.30 QPS\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", - "Average latency = 6.81 ms, Throughput = 146.75 QPS\n", - "Test summary is saved to onnx/perf_results_GPU_B1_S128_20210714-002224.txt\n" - ] - } - ], - "source": [ - "GPU_OPTION = '--use_gpu' if use_gpu else ''\n", - "!python -m onnxruntime.transformers.bert_perf_test --model $optimized_fp16_model_path --batch_size 1 --sequence_length 128 --samples 1000 --test_times 1 $GPU_OPTION" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Float32 model perf results from ./onnx/perf_results_GPU_B1_S128_20210714-002224.txt\n" + "Perf results from ./onnx\\perf_results_GPU_B1_S128_20230912-130021.txt\n" ] }, { @@ -1026,82 +1224,243 @@ " \n", " \n", " 0\n", - " 6.76\n", - " 6.79\n", - " 6.81\n", - " 6.90\n", - " 6.91\n", - " 7.00\n", - " 147.85\n", - " 3\n", + " 1.74\n", + " 1.72\n", + " 1.72\n", + " 1.75\n", + " 1.80\n", + " 2.17\n", + " 575.17\n", + " 14\n", " \n", " \n", " 1\n", - " 6.78\n", - " 6.70\n", - " 6.79\n", - " 6.87\n", - " 6.90\n", - " 7.63\n", - " 147.54\n", - " 4\n", + " 1.74\n", + " 1.73\n", + " 1.73\n", + " 1.75\n", + " 1.76\n", + " 2.14\n", + " 574.96\n", + " 24\n", " \n", " \n", " 2\n", - " 6.79\n", - " 6.79\n", - " 6.81\n", - " 6.89\n", - " 6.91\n", - " 
7.19\n", - " 147.30\n", - " 2\n", + " 1.74\n", + " 1.72\n", + " 1.73\n", + " 1.76\n", + " 1.79\n", + " 2.16\n", + " 574.28\n", + " 15\n", " \n", " \n", " 3\n", - " 6.81\n", - " 6.80\n", - " 6.89\n", - " 6.91\n", - " 6.97\n", - " 7.20\n", - " 146.75\n", + " 1.75\n", + " 1.72\n", + " 1.72\n", + " 1.76\n", + " 2.02\n", + " 2.15\n", + " 572.89\n", + " 6\n", + " \n", + " \n", + " 4\n", + " 1.76\n", + " 1.74\n", + " 1.74\n", + " 1.76\n", + " 1.81\n", + " 2.14\n", + " 569.77\n", + " 13\n", + " \n", + " \n", + " 5\n", + " 1.76\n", + " 1.72\n", + " 1.73\n", + " 1.80\n", + " 2.08\n", + " 2.15\n", + " 568.67\n", + " 5\n", + " \n", + " \n", + " 6\n", + " 1.77\n", + " 1.73\n", + " 1.74\n", + " 1.81\n", + " 2.12\n", + " 2.19\n", + " 566.45\n", + " 32\n", + " \n", + " \n", + " 7\n", + " 1.77\n", + " 1.74\n", + " 1.74\n", + " 1.77\n", + " 2.06\n", + " 2.17\n", + " 566.38\n", + " 7\n", + " \n", + " \n", + " 8\n", + " 1.77\n", + " 1.73\n", + " 1.74\n", + " 1.81\n", + " 2.10\n", + " 2.18\n", + " 566.14\n", + " 3\n", + " \n", + " \n", + " 9\n", + " 1.77\n", + " 1.73\n", + " 1.74\n", + " 1.82\n", + " 2.07\n", + " 2.17\n", + " 566.09\n", + " 11\n", + " \n", + " \n", + " 10\n", + " 1.77\n", + " 1.74\n", + " 1.75\n", + " 1.78\n", + " 2.02\n", + " 2.13\n", + " 565.70\n", + " 9\n", + " \n", + " \n", + " 11\n", + " 1.77\n", + " 1.73\n", + " 1.74\n", + " 1.93\n", + " 2.06\n", + " 2.16\n", + " 565.50\n", + " 8\n", + " \n", + " \n", + " 12\n", + " 1.77\n", + " 1.73\n", + " 1.74\n", + " 1.81\n", + " 2.11\n", + " 2.20\n", + " 565.09\n", " 1\n", " \n", + " \n", + " 13\n", + " 1.77\n", + " 1.74\n", + " 1.75\n", + " 1.85\n", + " 2.06\n", + " 2.15\n", + " 563.97\n", + " 10\n", + " \n", + " \n", + " 14\n", + " 1.78\n", + " 1.73\n", + " 1.74\n", + " 1.93\n", + " 2.13\n", + " 2.19\n", + " 563.25\n", + " 2\n", + " \n", + " \n", + " 15\n", + " 1.78\n", + " 1.74\n", + " 1.75\n", + " 1.88\n", + " 2.10\n", + " 2.19\n", + " 561.98\n", + " 4\n", + " \n", + " \n", + " 16\n", + " 1.79\n", + " 1.75\n", + " 
1.76\n", + " 1.99\n", + " 2.08\n", + " 2.16\n", + " 559.84\n", + " 12\n", + " \n", " \n", "\n", "" ], "text/plain": [ - " Latency(ms) Latency_P50 Latency_P75 Latency_P90 Latency_P95 \\\n", - "0 6.76 6.79 6.81 6.90 6.91 \n", - "1 6.78 6.70 6.79 6.87 6.90 \n", - "2 6.79 6.79 6.81 6.89 6.91 \n", - "3 6.81 6.80 6.89 6.91 6.97 \n", + " Latency(ms) Latency_P50 Latency_P75 Latency_P90 Latency_P95 \\\n", + "0 1.74 1.72 1.72 1.75 1.80 \n", + "1 1.74 1.73 1.73 1.75 1.76 \n", + "2 1.74 1.72 1.73 1.76 1.79 \n", + "3 1.75 1.72 1.72 1.76 2.02 \n", + "4 1.76 1.74 1.74 1.76 1.81 \n", + "5 1.76 1.72 1.73 1.80 2.08 \n", + "6 1.77 1.73 1.74 1.81 2.12 \n", + "7 1.77 1.74 1.74 1.77 2.06 \n", + "8 1.77 1.73 1.74 1.81 2.10 \n", + "9 1.77 1.73 1.74 1.82 2.07 \n", + "10 1.77 1.74 1.75 1.78 2.02 \n", + "11 1.77 1.73 1.74 1.93 2.06 \n", + "12 1.77 1.73 1.74 1.81 2.11 \n", + "13 1.77 1.74 1.75 1.85 2.06 \n", + "14 1.78 1.73 1.74 1.93 2.13 \n", + "15 1.78 1.74 1.75 1.88 2.10 \n", + "16 1.79 1.75 1.76 1.99 2.08 \n", "\n", - " Latency_P99 Throughput(QPS) intra_op_num_threads \n", - "0 7.00 147.85 3 \n", - "1 7.63 147.54 4 \n", - "2 7.19 147.30 2 \n", - "3 7.20 146.75 1 " + " Latency_P99 Throughput(QPS) intra_op_num_threads \n", + "0 2.17 575.17 14 \n", + "1 2.14 574.96 24 \n", + "2 2.16 574.28 15 \n", + "3 2.15 572.89 6 \n", + "4 2.14 569.77 13 \n", + "5 2.15 568.67 5 \n", + "6 2.19 566.45 32 \n", + "7 2.17 566.38 7 \n", + "8 2.18 566.14 3 \n", + "9 2.17 566.09 11 \n", + "10 2.13 565.70 9 \n", + "11 2.16 565.50 8 \n", + "12 2.20 565.09 1 \n", + "13 2.15 563.97 10 \n", + "14 2.19 563.25 2 \n", + "15 2.19 561.98 4 \n", + "16 2.16 559.84 12 " ] }, - "execution_count": 22, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "import os\n", - "import glob \n", - "import pandas\n", - "latest_result_file = max(glob.glob(\"./onnx/perf_results_GPU_B1_S128_*.txt\"), key=os.path.getmtime)\n", - "result_data = pandas.read_table(latest_result_file)\n", - 
"print(\"Float32 model perf results from\", latest_result_file)\n", - "# Remove some columns that have same values for all rows.\n", - "columns_to_remove = ['model', 'graph_optimization_level', 'batch_size', 'sequence_length', 'test_cases', 'test_times', 'use_gpu']\n", - "result_data.drop(columns_to_remove, axis=1, inplace=True)\n", - "result_data" + "fp32_result = load_last_perf_test_result()\n", + "fp32_result" ] }, { @@ -1115,6 +1474,48 @@ "Here is an example that check the performance of multiple batch sizes (1, 2, 4, 8, 16, 32 and 64) using default settings." ] }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=32,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 20.41 ms, Throughput = 1567.65 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.73 ms, Throughput = 576.74 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=2,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 2.18 ms, Throughput = 917.92 QPS\n", + "Running test: 
model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=4,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 3.25 ms, Throughput = 1229.91 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=8,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 5.38 ms, Throughput = 1486.89 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=16,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 9.90 ms, Throughput = 1616.79 QPS\n", + "test setting TestSetting(batch_size=32, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, use_io_binding=True, provider=None, intra_op_num_threads=8, seed=3, verbose=False, log_severity=2, average_sequence_length=128, random_sequence_length=False)\n", + "Generating 1000 samples for batch_size=32 sequence_length=128\n", + "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, use_io_binding=True, provider=None, intra_op_num_threads=8, seed=3, verbose=False, log_severity=2, average_sequence_length=128, random_sequence_length=False)\n", + "Generating 1000 samples for batch_size=1 sequence_length=128\n", + "test setting TestSetting(batch_size=2, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, use_io_binding=True, provider=None, intra_op_num_threads=8, seed=3, verbose=False, log_severity=2, average_sequence_length=128, random_sequence_length=False)\n", + "Generating 1000 samples for batch_size=2 
sequence_length=128\n", + "test setting TestSetting(batch_size=4, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, use_io_binding=True, provider=None, intra_op_num_threads=8, seed=3, verbose=False, log_severity=2, average_sequence_length=128, random_sequence_length=False)\n", + "Generating 1000 samples for batch_size=4 sequence_length=128\n", + "test setting TestSetting(batch_size=8, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, use_io_binding=True, provider=None, intra_op_num_threads=8, seed=3, verbose=False, log_severity=2, average_sequence_length=128, random_sequence_length=False)\n", + "Generating 1000 samples for batch_size=8 sequence_length=128\n", + "test setting TestSetting(batch_size=16, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, use_io_binding=True, provider=None, intra_op_num_threads=8, seed=3, verbose=False, log_severity=2, average_sequence_length=128, random_sequence_length=False)\n", + "Generating 1000 samples for batch_size=16 sequence_length=128\n", + "Test summary is saved to onnx\\perf_results_GPU_B1-2-4-8-16-32_S128_20230912-130248.txt\n" + ] + } + ], + "source": [ + "THREAD_SETTING = '--intra_op_num_threads 8'\n", + "!{sys.executable} -m onnxruntime.transformers.bert_perf_test --model $optimized_fp16_model_path --batch_size 1 2 4 8 16 32 --sequence_length 128 --samples 1000 --test_times 1 $THREAD_SETTING $GPU_OPTION" + ] + }, { "cell_type": "code", "execution_count": 23, @@ -1124,52 +1525,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "test setting TestSetting(batch_size=32, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=3, seed=3, verbose=False)\n", - "Generating 1000 samples for batch_size=32 sequence_length=128\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=32,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", - "Average latency 
= 168.40 ms, Throughput = 190.02 QPS\n", - "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=3, seed=3, verbose=False)\n", - "Generating 1000 samples for batch_size=1 sequence_length=128\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", - "Average latency = 7.14 ms, Throughput = 140.00 QPS\n", - "test setting TestSetting(batch_size=2, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=3, seed=3, verbose=False)\n", - "Generating 1000 samples for batch_size=2 sequence_length=128\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=2,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", - "Average latency = 11.27 ms, Throughput = 177.41 QPS\n", - "test setting TestSetting(batch_size=4, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=3, seed=3, verbose=False)\n", - "Generating 1000 samples for batch_size=4 sequence_length=128\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=4,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", - "Average latency = 21.15 ms, Throughput = 189.09 QPS\n", - "test setting TestSetting(batch_size=8, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=3, seed=3, verbose=False)\n", - "Generating 1000 samples for batch_size=8 sequence_length=128\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=8,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", - "Average latency = 42.27 ms, Throughput = 189.27 QPS\n", - "test 
setting TestSetting(batch_size=16, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=3, seed=3, verbose=False)\n", - "Generating 1000 samples for batch_size=16 sequence_length=128\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=16,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", - "Average latency = 83.77 ms, Throughput = 191.01 QPS\n", - "Test summary is saved to onnx/perf_results_GPU_B1-2-4-8-16-32_S128_20210714-002816.txt\n" - ] - } - ], - "source": [ - "GPU_OPTION = '--use_gpu' if use_gpu else ''\n", - "THREAD_SETTING = '--intra_op_num_threads 3'\n", - "!{sys.executable} -m onnxruntime.transformers.bert_perf_test --model $optimized_fp16_model_path --batch_size 1 2 4 8 16 32 --sequence_length 128 --samples 1000 --test_times 1 $THREAD_SETTING $GPU_OPTION" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": { - "scrolled": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Float16 model summary from ./onnx/perf_results_GPU_B1-2-4-8-16-32_S128_20210714-002816.txt\n" + "Perf results from ./onnx\\perf_results_GPU_B1-2-4-8-16-32_S128_20230912-130248.txt\n" ] }, { @@ -1200,75 +1556,75 @@ " Latency_P95\n", " Latency_P99\n", " Throughput(QPS)\n", - " batch_size\n", + " intra_op_num_threads\n", " \n", " \n", " \n", " \n", " 0\n", - " 7.14\n", - " 7.10\n", - " 7.13\n", - " 7.25\n", - " 7.35\n", - " 10.99\n", - " 140.00\n", - " 1\n", + " 1.73\n", + " 1.72\n", + " 1.73\n", + " 1.73\n", + " 1.79\n", + " 2.04\n", + " 576.74\n", + " 8\n", " \n", " \n", " 1\n", - " 11.27\n", - " 11.23\n", - " 11.28\n", - " 11.53\n", - " 11.57\n", - " 12.05\n", - " 177.41\n", - " 2\n", + " 2.18\n", + " 2.16\n", + " 2.16\n", + " 2.18\n", + " 2.29\n", + " 2.76\n", + " 917.92\n", + " 8\n", " \n", " \n", " 2\n", - " 21.15\n", - " 21.13\n", - " 21.25\n", - " 21.44\n", - " 21.59\n", - " 22.07\n", - 
" 189.09\n", - " 4\n", + " 3.25\n", + " 3.25\n", + " 3.26\n", + " 3.28\n", + " 3.29\n", + " 3.43\n", + " 1229.91\n", + " 8\n", " \n", " \n", " 3\n", - " 42.27\n", - " 42.26\n", - " 42.68\n", - " 42.95\n", - " 43.11\n", - " 45.11\n", - " 189.27\n", + " 5.38\n", + " 5.38\n", + " 5.39\n", + " 5.42\n", + " 5.44\n", + " 5.60\n", + " 1486.89\n", " 8\n", " \n", " \n", " 4\n", - " 83.77\n", - " 83.84\n", - " 84.29\n", - " 84.94\n", - " 85.35\n", - " 86.34\n", - " 191.01\n", - " 16\n", + " 9.90\n", + " 9.89\n", + " 9.94\n", + " 9.97\n", + " 10.00\n", + " 10.06\n", + " 1616.79\n", + " 8\n", " \n", " \n", " 5\n", - " 168.40\n", - " 169.62\n", - " 170.78\n", - " 171.94\n", - " 172.82\n", - " 174.28\n", - " 190.02\n", - " 32\n", + " 20.41\n", + " 20.41\n", + " 20.47\n", + " 20.52\n", + " 20.55\n", + " 20.68\n", + " 1567.65\n", + " 8\n", " \n", " \n", "\n", @@ -1276,38 +1632,239 @@ ], "text/plain": [ " Latency(ms) Latency_P50 Latency_P75 Latency_P90 Latency_P95 \\\n", - "0 7.14 7.10 7.13 7.25 7.35 \n", - "1 11.27 11.23 11.28 11.53 11.57 \n", - "2 21.15 21.13 21.25 21.44 21.59 \n", - "3 42.27 42.26 42.68 42.95 43.11 \n", - "4 83.77 83.84 84.29 84.94 85.35 \n", - "5 168.40 169.62 170.78 171.94 172.82 \n", + "0 1.73 1.72 1.73 1.73 1.79 \n", + "1 2.18 2.16 2.16 2.18 2.29 \n", + "2 3.25 3.25 3.26 3.28 3.29 \n", + "3 5.38 5.38 5.39 5.42 5.44 \n", + "4 9.90 9.89 9.94 9.97 10.00 \n", + "5 20.41 20.41 20.47 20.52 20.55 \n", "\n", - " Latency_P99 Throughput(QPS) batch_size \n", - "0 10.99 140.00 1 \n", - "1 12.05 177.41 2 \n", - "2 22.07 189.09 4 \n", - "3 45.11 189.27 8 \n", - "4 86.34 191.01 16 \n", - "5 174.28 190.02 32 " + " Latency_P99 Throughput(QPS) intra_op_num_threads \n", + "0 2.04 576.74 8 \n", + "1 2.76 917.92 8 \n", + "2 3.43 1229.91 8 \n", + "3 5.60 1486.89 8 \n", + "4 10.06 1616.79 8 \n", + "5 20.68 1567.65 8 " ] }, - "execution_count": 24, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "import os\n", - "import glob \n", - 
"import pandas\n", - "latest_result_file = max(glob.glob(\"./onnx/perf_results_*.txt\"), key=os.path.getmtime)\n", - "result_data = pandas.read_table(latest_result_file)\n", - "print(\"Float16 model summary from\", latest_result_file)\n", - "columns_to_remove = ['model', 'graph_optimization_level', 'test_cases', 'test_times', 'use_gpu', 'sequence_length']\n", - "columns_to_remove.extend(['intra_op_num_threads'])\n", - "result_data.drop(columns_to_remove, axis=1, inplace=True)\n", - "result_data" + "fp16_result = load_last_perf_test_result()\n", + "fp16_result" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Packing Mode (Effective Transformer)\n", + "\n", + "When padding ratio is high, it is helpful to use packing mode, also known as [effective transformer](https://github.com/bytedance/effective_transformer).\n", + "This feature requires onnxruntime-gpu version 1.16 or later. \n", + "\n", + "In the example below, average sequence length after removing paddings is 32, the sequence length with paddings is 128. We can see 3x throughput with packing mode (QPS increased from 1617 to 5652)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "_replace_attention_with_packing_attention: Converted 24 Attention nodes to PackedAttention.\n", + " save_model_to_file: Sort graphs in topological order\n", + " save: Delete the existing onnx file: ./onnx/bert-base-cased-squad_opt_gpu_fp16_packed.onnx\n", + " save: Delete the existing external data file: ./onnx/bert-base-cased-squad_opt_gpu_fp16_packed.onnx.data\n", + " save_model_to_file: Model saved to ./onnx/bert-base-cased-squad_opt_gpu_fp16_packed.onnx\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Running test: model=bert-base-cased-squad_opt_gpu_fp16_packed.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=32,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=32,random_sequence_length=False\n", + "Average latency = 5.66 ms, Throughput = 5652.40 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16_packed.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=32,random_sequence_length=False\n", + "Average latency = 1.70 ms, Throughput = 586.97 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16_packed.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=2,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=32,random_sequence_length=False\n", + "Average latency = 1.79 ms, Throughput = 1114.37 QPS\n", + "Running test: 
model=bert-base-cased-squad_opt_gpu_fp16_packed.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=4,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=32,random_sequence_length=False\n", + "Average latency = 1.77 ms, Throughput = 2262.31 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16_packed.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=8,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=32,random_sequence_length=False\n", + "Average latency = 2.18 ms, Throughput = 3666.45 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16_packed.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=16,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=32,random_sequence_length=False\n", + "Average latency = 3.31 ms, Throughput = 4829.58 QPS\n", + "test setting TestSetting(batch_size=32, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, use_io_binding=True, provider=None, intra_op_num_threads=8, seed=3, verbose=False, log_severity=2, average_sequence_length=32, random_sequence_length=False)\n", + "Generating 1000 samples for batch_size=32 sequence_length=128\n", + "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, use_io_binding=True, provider=None, intra_op_num_threads=8, seed=3, verbose=False, log_severity=2, average_sequence_length=32, random_sequence_length=False)\n", + "Generating 1000 samples for batch_size=1 sequence_length=128\n", + "test setting TestSetting(batch_size=2, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, use_io_binding=True, provider=None, intra_op_num_threads=8, seed=3, verbose=False, log_severity=2, average_sequence_length=32, random_sequence_length=False)\n", + "Generating 1000 samples for 
batch_size=2 sequence_length=128\n", + "test setting TestSetting(batch_size=4, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, use_io_binding=True, provider=None, intra_op_num_threads=8, seed=3, verbose=False, log_severity=2, average_sequence_length=32, random_sequence_length=False)\n", + "Generating 1000 samples for batch_size=4 sequence_length=128\n", + "test setting TestSetting(batch_size=8, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, use_io_binding=True, provider=None, intra_op_num_threads=8, seed=3, verbose=False, log_severity=2, average_sequence_length=32, random_sequence_length=False)\n", + "Generating 1000 samples for batch_size=8 sequence_length=128\n", + "test setting TestSetting(batch_size=16, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, use_io_binding=True, provider=None, intra_op_num_threads=8, seed=3, verbose=False, log_severity=2, average_sequence_length=32, random_sequence_length=False)\n", + "Generating 1000 samples for batch_size=16 sequence_length=128\n", + "Test summary is saved to onnx\\perf_results_GPU_B1-2-4-8-16-32_S128_20230912-130354.txt\n" + ] + } + ], + "source": [ + "assert use_gpu, \"Require GPU for packing mode\"\n", + "packed_fp16_model_path = './onnx/bert-base-cased-squad_opt_gpu_fp16_packed.onnx'\n", + "!{sys.executable} -m onnxruntime.transformers.convert_to_packing_mode --input $optimized_fp16_model_path --output $packed_fp16_model_path --use_external_data_format\n", + "!{sys.executable} -m onnxruntime.transformers.bert_perf_test --model $packed_fp16_model_path --batch_size 1 2 4 8 16 32 --sequence_length 128 --average_sequence_length 32 --samples 1000 --test_times 1 $THREAD_SETTING $GPU_OPTION " + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Perf results from ./onnx\\perf_results_GPU_B1-2-4-8-16-32_S128_20230912-130354.txt\n" + ] + }, + { + "data": { + 
"text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Latency(ms)Latency_P50Latency_P75Latency_P90Latency_P95Latency_P99Throughput(QPS)intra_op_num_threads
01.701.631.652.132.202.32586.978
11.771.741.761.821.932.172262.318
21.791.731.742.122.182.321114.378
32.182.162.172.222.302.643666.458
43.313.313.323.353.393.514829.588
55.665.665.685.715.745.915652.408
\n", + "
" + ], + "text/plain": [ + " Latency(ms) Latency_P50 Latency_P75 Latency_P90 Latency_P95 \\\n", + "0 1.70 1.63 1.65 2.13 2.20 \n", + "1 1.77 1.74 1.76 1.82 1.93 \n", + "2 1.79 1.73 1.74 2.12 2.18 \n", + "3 2.18 2.16 2.17 2.22 2.30 \n", + "4 3.31 3.31 3.32 3.35 3.39 \n", + "5 5.66 5.66 5.68 5.71 5.74 \n", + "\n", + " Latency_P99 Throughput(QPS) intra_op_num_threads \n", + "0 2.32 586.97 8 \n", + "1 2.17 2262.31 8 \n", + "2 2.32 1114.37 8 \n", + "3 2.64 3666.45 8 \n", + "4 3.51 4829.58 8 \n", + "5 5.91 5652.40 8 " + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "packing_result = load_last_perf_test_result()\n", + "packing_result" ] }, { @@ -1327,7 +1884,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 26, "metadata": { "scrolled": true }, @@ -1336,131 +1893,53 @@ "name": "stdout", "output_type": "stream", "text": [ - "{\r\n", - " \"gpu\": {\r\n", - " \"driver_version\": \"450.51.05\",\r\n", - " \"devices\": [\r\n", - " {\r\n", - " \"memory_total\": 15843721216,\r\n", - " \"memory_available\": 9313189888,\r\n", - " \"name\": \"Tesla T4\"\r\n", - " }\r\n", - " ]\r\n", - " },\r\n", - " \"cpu\": {\r\n", - " \"brand\": \"AMD EPYC 7V12 64-Core Processor\",\r\n", - " \"cores\": 4,\r\n", - " \"logical_cores\": 4,\r\n", - " \"hz\": [\r\n", - " 2445417000,\r\n", - " 0\r\n", - " ],\r\n", - " \"l2_cache\": 524288,\r\n", - " \"flags\": [\r\n", - " \"3dnowext\",\r\n", - " \"3dnowprefetch\",\r\n", - " \"abm\",\r\n", - " \"adx\",\r\n", - " \"aes\",\r\n", - " \"apic\",\r\n", - " \"arat\",\r\n", - " \"avx\",\r\n", - " \"avx2\",\r\n", - " \"bmi1\",\r\n", - " \"bmi2\",\r\n", - " \"clflush\",\r\n", - " \"clflushopt\",\r\n", - " \"clwb\",\r\n", - " \"cmov\",\r\n", - " \"cmp_legacy\",\r\n", - " \"cpuid\",\r\n", - " \"cr8_legacy\",\r\n", - " \"cx16\",\r\n", - " \"cx8\",\r\n", - " \"de\",\r\n", - " \"extd_apicid\",\r\n", - " \"f16c\",\r\n", - " \"fma\",\r\n", - " \"fpu\",\r\n", - " \"fsgsbase\",\r\n", 
- " \"fxsr\",\r\n", - " \"fxsr_opt\",\r\n", - " \"ht\",\r\n", - " \"hypervisor\",\r\n", - " \"lahf_lm\",\r\n", - " \"lm\",\r\n", - " \"mca\",\r\n", - " \"mce\",\r\n", - " \"misalignsse\",\r\n", - " \"mmx\",\r\n", - " \"mmxext\",\r\n", - " \"movbe\",\r\n", - " \"msr\",\r\n", - " \"mtrr\",\r\n", - " \"nopl\",\r\n", - " \"nx\",\r\n", - " \"osvw\",\r\n", - " \"osxsave\",\r\n", - " \"pae\",\r\n", - " \"pat\",\r\n", - " \"pclmulqdq\",\r\n", - " \"pdpe1gb\",\r\n", - " \"pge\",\r\n", - " \"pni\",\r\n", - " \"popcnt\",\r\n", - " \"pse\",\r\n", - " \"pse36\",\r\n", - " \"rdpid\",\r\n", - " \"rdrand\",\r\n", - " \"rdrnd\",\r\n", - " \"rdseed\",\r\n", - " \"rdtscp\",\r\n", - " \"rep_good\",\r\n", - " \"sep\",\r\n", - " \"sha\",\r\n", - " \"sha_ni\",\r\n", - " \"smap\",\r\n", - " \"smep\",\r\n", - " \"ssbd\",\r\n", - " \"sse\",\r\n", - " \"sse2\",\r\n", - " \"sse4_1\",\r\n", - " \"sse4_2\",\r\n", - " \"sse4a\",\r\n", - " \"ssse3\",\r\n", - " \"syscall\",\r\n", - " \"topoext\",\r\n", - " \"tsc\",\r\n", - " \"umip\",\r\n", - " \"vme\",\r\n", - " \"vmmcall\",\r\n", - " \"xgetbv1\",\r\n", - " \"xsave\",\r\n", - " \"xsavec\",\r\n", - " \"xsaveerptr\",\r\n", - " \"xsaveopt\",\r\n", - " \"xsaves\"\r\n", - " ],\r\n", - " \"processor\": \"x86_64\"\r\n", - " },\r\n", - " \"memory\": {\r\n", - " \"total\": 29450223616,\r\n", - " \"available\": 22402334720\r\n", - " },\r\n", - " \"python\": \"3.6.13.final.0 (64 bit)\",\r\n", - " \"os\": \"Linux-5.4.0-1046-azure-x86_64-with-debian-buster-sid\",\r\n", - " \"onnxruntime\": {\r\n", - " \"version\": \"1.8.1\",\r\n", - " \"support_gpu\": true\r\n", - " },\r\n", - " \"onnxruntime_tools\": null,\r\n", - " \"pytorch\": {\r\n", - " \"version\": \"1.9.0+cu111\",\r\n", - " \"support_gpu\": true,\r\n", - " \"cuda\": \"11.1\"\r\n", - " },\r\n", - " \"tensorflow\": null\r\n", - "}\r\n" + "{\n", + " \"gpu\": {\n", + " \"driver_version\": \"537.13\",\n", + " \"devices\": [\n", + " {\n", + " \"memory_total\": 25757220864,\n", + " \"memory_available\": 
18009264128,\n", + " \"name\": \"NVIDIA GeForce RTX 4090\"\n", + " }\n", + " ]\n", + " },\n", + " \"cpu\": {\n", + " \"brand\": \"13th Gen Intel(R) Core(TM) i9-13900\",\n", + " \"cores\": 24,\n", + " \"logical_cores\": 32,\n", + " \"hz\": \"2000000000,0\",\n", + " \"l2_cache\": 33554432,\n", + " \"flags\": \"3dnow,3dnowprefetch,abm,acpi,adx,aes,apic,avx,avx2,bmi1,bmi2,clflush,clflushopt,clwb,cmov,cx16,cx8,de,dts,erms,est,f16c,fma,fpu,fxsr,gfni,ht,hypervisor,ia64,intel_pt,invpcid,lahf_lm,mca,mce,mmx,monitor,movbe,msr,mtrr,osxsave,pae,pat,pbe,pcid,pclmulqdq,pdcm,pge,pni,popcnt,pse,pse36,rdpid,rdrnd,rdseed,sep,serial,sha,smap,smep,ss,sse,sse2,sse4_1,sse4_2,ssse3,tm,tm2,tsc,tscdeadline,umip,vaes,vme,vpclmulqdq,x2apic,xsave,xtpr\",\n", + " \"processor\": \"Intel64 Family 6 Model 183 Stepping 1, GenuineIntel\"\n", + " },\n", + " \"memory\": {\n", + " \"total\": 33992912896,\n", + " \"available\": 17272422400\n", + " },\n", + " \"os\": \"Windows-10-10.0.22621-SP0\",\n", + " \"python\": \"3.10.13.final.0 (64 bit)\",\n", + " \"packages\": {\n", + " \"flatbuffers\": \"23.5.26\",\n", + " \"numpy\": \"1.25.2\",\n", + " \"onnx\": \"1.14.1\",\n", + " \"onnxruntime-gpu\": \"1.16.0\",\n", + " \"protobuf\": \"3.20.3\",\n", + " \"sympy\": \"1.12\",\n", + " \"torch\": \"2.0.1+cu118\",\n", + " \"transformers\": \"4.33.1\"\n", + " },\n", + " \"onnxruntime\": {\n", + " \"version\": \"1.16.0\",\n", + " \"support_gpu\": true\n", + " },\n", + " \"pytorch\": {\n", + " \"version\": \"2.0.1+cu118\",\n", + " \"support_gpu\": true,\n", + " \"cuda\": \"11.8\"\n", + " },\n", + " \"tensorflow\": null\n", + "}\n" ] } ], @@ -1485,9 +1964,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.13" + "version": "3.10.13" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 }