mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-23 22:13:38 +00:00
Update notebook: disable GPU for tensorflow (#4649)
This commit is contained in:
parent
623dd53eb7
commit
326cc686df
1 changed files with 80 additions and 76 deletions
|
|
@ -54,11 +54,11 @@
|
|||
"source": [
|
||||
"import sys\n",
|
||||
" \n",
|
||||
"!{sys.executable} -m pip install --quiet --upgrade tensorflow==2.2.0\n",
|
||||
"!{sys.executable} -m pip install --quiet --upgrade onnxruntime\n",
|
||||
"!{sys.executable} -m pip install --quiet --upgrade onnxruntime-tools\n",
|
||||
"!{sys.executable} -m pip install --quiet --upgrade keras2onnx\n",
|
||||
"!{sys.executable} -m pip install --quiet transformers==2.11.0\n",
|
||||
"!{sys.executable} -m pip install --quiet --upgrade tensorflow==2.3.0\n",
|
||||
"!{sys.executable} -m pip install --quiet --upgrade onnxruntime==1.4.0\n",
|
||||
"!{sys.executable} -m pip install --quiet --upgrade onnxruntime-tools==1.4.0\n",
|
||||
"!{sys.executable} -m pip install --quiet --upgrade keras2onnx==1.7.0\n",
|
||||
"!{sys.executable} -m pip install --quiet transformers==3.0.2\n",
|
||||
"!{sys.executable} -m pip install --quiet wget pandas"
|
||||
]
|
||||
},
|
||||
|
|
@ -92,13 +92,23 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"cache_dir = './cached_models'\n",
|
||||
"cache_dir = './cache_models'\n",
|
||||
"output_dir = './onnx_models'\n",
|
||||
"for directory in [cache_dir, output_dir]:\n",
|
||||
" if not os.path.exists(directory):\n",
|
||||
" os.makedirs(directory)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import tensorflow as tf\n",
|
||||
"tf.config.set_visible_devices([], 'GPU') # Disable GPU for fair comparison"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
|
|
@ -115,16 +125,29 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 5,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Some weights of the model checkpoint at bert-base-cased were not used when initializing TFBertForQuestionAnswering: ['nsp___cls', 'mlm___cls']\n",
|
||||
"- This IS expected if you are initializing TFBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).\n",
|
||||
"- This IS NOT expected if you are initializing TFBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
|
||||
"Some weights of TFBertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs']\n",
|
||||
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from transformers import (TFBertForQuestionAnswering, BertTokenizer)\n",
|
||||
"\n",
|
||||
"#model_name_or_path = 'bert-large-uncased-whole-word-masking-finetuned-squad'\n",
|
||||
"model_name_or_path = \"bert-base-cased\"\n",
|
||||
"is_fine_tuned = (model_name_or_path == 'bert-large-uncased-whole-word-masking-finetuned-squad')\n",
|
||||
"\n",
|
||||
"# Load model and tokenizer\n",
|
||||
"tokenizer = BertTokenizer.from_pretrained(model_name_or_path, do_lower_case=True, cache_dir=cache_dir)\n",
|
||||
|
|
@ -144,42 +167,34 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"The answer is: [CLS] what is on ##nx run ##time ? [SEP] on ##nx run ##time is a performance - focused in ##ference engine for on ##nx models\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import tensorflow as tf\n",
|
||||
"import numpy\n",
|
||||
"\n",
|
||||
"question, text = \"What is ONNX Runtime?\", \"ONNX Runtime is a performance-focused inference engine for ONNX models.\"\n",
|
||||
"# Pad to max length is needed. Otherwise, position embedding might be truncated by constant folding.\n",
|
||||
"inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors='tf',\n",
|
||||
" max_length=max_sequence_length, pad_to_max_length=True)\n",
|
||||
" max_length=max_sequence_length, pad_to_max_length=True, truncation=True)\n",
|
||||
"start_scores, end_scores = model(inputs)\n",
|
||||
"\n",
|
||||
"num_tokens = len(inputs[\"input_ids\"][0])\n",
|
||||
"all_tokens = tokenizer.convert_ids_to_tokens(inputs[\"input_ids\"][0])\n",
|
||||
"print(\"The answer is:\", ' '.join(all_tokens[numpy.argmax(start_scores) : numpy.argmax(end_scores)+1]))"
|
||||
"if is_fine_tuned:\n",
|
||||
" all_tokens = tokenizer.convert_ids_to_tokens(inputs[\"input_ids\"][0])\n",
|
||||
" print(\"The answer is:\", ' '.join(all_tokens[numpy.argmax(start_scores) : numpy.argmax(end_scores)+1]))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Tensorflow Inference time for sequence length 512 = 94.62 ms\n"
|
||||
"Tensorflow Inference time for sequence length 512 = 1133.13 ms\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
|
@ -203,7 +218,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
|
@ -239,7 +254,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
|
@ -260,14 +275,14 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"ONNX Runtime cpu inference time for sequence length 512 (model not optimized): 630.54 ms\n"
|
||||
"ONNX Runtime cpu inference time for sequence length 512 (model not optimized): 654.49 ms\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
|
@ -280,7 +295,7 @@
|
|||
"\n",
|
||||
"# intra_op_num_threads=1 can be used to enable OpenMP in OnnxRuntime 1.2.0.\n",
|
||||
"# For OnnxRuntime 1.3.0 or later, this does not have effect unless you are using onnxruntime-gpu package.\n",
|
||||
"sess_options.intra_op_num_threads=1\n",
|
||||
"# sess_options.intra_op_num_threads=1\n",
|
||||
"\n",
|
||||
"# Providers is optional. Only needed when you use onnxruntime-gpu for CPU inference.\n",
|
||||
"session = onnxruntime.InferenceSession(output_model_path, sess_options, providers=['CPUExecutionProvider'])\n",
|
||||
|
|
@ -302,26 +317,15 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"***** Verifying correctness (TensorFlow and ONNX Runtime) *****\n",
|
||||
"WARNING:tensorflow:From <ipython-input-10-453158d8869f>:2: _EagerTensorBase.cpu (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\n",
|
||||
"Instructions for updating:\n",
|
||||
"Use tf.identity instead.\n",
|
||||
"start_scores are close: False\n",
|
||||
"end_scores are close: False\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"***** Verifying correctness (TensorFlow and ONNX Runtime) *****\")\n",
|
||||
"print('start_scores are close:', numpy.allclose(results[0], start_scores.cpu(), rtol=1e-05, atol=1e-04))\n",
|
||||
"print('end_scores are close:', numpy.allclose(results[1], end_scores.cpu(), rtol=1e-05, atol=1e-04))"
|
||||
"# Some weights of TFBertForQuestionAnswering might not be initialized without fine-tuning.\n",
|
||||
"if is_fine_tuned:\n",
|
||||
" print(\"***** Verifying correctness (TensorFlow and ONNX Runtime) *****\")\n",
|
||||
" print('start_scores are close:', numpy.allclose(results[0], start_scores.cpu(), rtol=1e-05, atol=1e-04))\n",
|
||||
" print('end_scores are close:', numpy.allclose(results[1], end_scores.cpu(), rtol=1e-05, atol=1e-04))"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@ -346,7 +350,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
|
@ -367,14 +371,14 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"ONNX Runtime cpu inference time on optimized model: 369.18 ms\n"
|
||||
"ONNX Runtime cpu inference time on optimized model: 328.48 ms\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
|
@ -394,7 +398,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
|
@ -402,15 +406,15 @@
|
|||
"output_type": "stream",
|
||||
"text": [
|
||||
"***** Verifying correctness (before and after optimization) *****\n",
|
||||
"start_scores are close: False\n",
|
||||
"end_scores are close: False\n"
|
||||
"start_scores are close: True\n",
|
||||
"end_scores are close: True\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(\"***** Verifying correctness (before and after optimization) *****\")\n",
|
||||
"print('start_scores are close:', numpy.allclose(opt_results[0], start_scores.cpu(), rtol=1e-05, atol=1e-04))\n",
|
||||
"print('end_scores are close:', numpy.allclose(opt_results[1], end_scores.cpu(), rtol=1e-05, atol=1e-04))"
|
||||
"print('start_scores are close:', numpy.allclose(opt_results[0], results[0], rtol=1e-05, atol=1e-04))\n",
|
||||
"print('end_scores are close:', numpy.allclose(opt_results[1], results[1], rtol=1e-05, atol=1e-04))"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@ -426,7 +430,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
|
@ -434,8 +438,8 @@
|
|||
"output_type": "stream",
|
||||
"text": [
|
||||
"100% passed for 10 random inputs given thresholds (rtol=0.001, atol=0.0001).\n",
|
||||
"maximum absolute difference=1.2461096048355103e-06\n",
|
||||
"maximum relative difference=0.006510902661830187\n"
|
||||
"maximum absolute difference=1.6242265701293945e-06\n",
|
||||
"maximum relative difference=0.009154098108410835\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
|
@ -457,7 +461,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
|
@ -465,17 +469,17 @@
|
|||
"output_type": "stream",
|
||||
"text": [
|
||||
"Running test: model=keras_bert-base-cased_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
|
||||
"Average latency = 99.01 ms, Throughput = 10.10 QPS\n",
|
||||
"Average latency = 97.93 ms, Throughput = 10.21 QPS\n",
|
||||
"test setting TestSetting(batch_size=1, sequence_length=128, test_cases=100, test_times=1, contiguous=None, use_gpu=False, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n",
|
||||
"Generating 100 samples for batch_size=1 sequence_length=128\n",
|
||||
"Test summary is saved to onnx_models\\perf_results_CPU_B1_S128_20200617-210258.txt\n"
|
||||
"Test summary is saved to onnx_models\\perf_results_CPU_B1_S128_20200728-165907.txt\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"THREAD_SETTING = '--intra_op_num_threads 1 --omp_num_threads {} --omp_wait_policy ACTIVE'.format(psutil.cpu_count(logical=True))\n",
|
||||
"\n",
|
||||
"!{sys.executable} -m onnxruntime_tools.transformers.bert_perf_test --model $optimized_model_path --batch_size 1 --sequence_length 128 --samples 100 --test_times 1 --inclusive $THREAD_SETTING\n"
|
||||
"!{sys.executable} -m onnxruntime_tools.transformers.bert_perf_test --model $optimized_model_path --batch_size 1 --sequence_length 128 --samples 100 --test_times 1 --inclusive $THREAD_SETTING"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@ -487,14 +491,14 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"./onnx_models\\perf_results_CPU_B1_S128_20200617-210258.txt\n"
|
||||
"./onnx_models\\perf_results_CPU_B1_S128_20200728-165907.txt\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@ -534,9 +538,9 @@
|
|||
" <td>12</td>\n",
|
||||
" <td>ACTIVE</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>99.01</td>\n",
|
||||
" <td>130.11</td>\n",
|
||||
" <td>10.1</td>\n",
|
||||
" <td>97.93</td>\n",
|
||||
" <td>158.16</td>\n",
|
||||
" <td>10.21</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
|
|
@ -547,10 +551,10 @@
|
|||
"0 1 12 ACTIVE None \n",
|
||||
"\n",
|
||||
" Latency(ms) Latency_P99 Throughput(QPS) \n",
|
||||
"0 99.01 130.11 10.1 "
|
||||
"0 97.93 158.16 10.21 "
|
||||
]
|
||||
},
|
||||
"execution_count": 16,
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
|
@ -589,7 +593,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
|
@ -602,7 +606,7 @@
|
|||
" \"devices\": [\n",
|
||||
" {\n",
|
||||
" \"memory_total\": 8589934592,\n",
|
||||
" \"memory_available\": 1643134976,\n",
|
||||
" \"memory_available\": 8480882688,\n",
|
||||
" \"name\": \"GeForce GTX 1070\"\n",
|
||||
" }\n",
|
||||
" ]\n",
|
||||
|
|
@ -618,12 +622,12 @@
|
|||
" },\n",
|
||||
" \"memory\": {\n",
|
||||
" \"total\": 16971259904,\n",
|
||||
" \"available\": 3282817024\n",
|
||||
" \"available\": 3480842240\n",
|
||||
" },\n",
|
||||
" \"python\": \"3.6.10.final.0 (64 bit)\",\n",
|
||||
" \"os\": \"Windows-10-10.0.18362-SP0\",\n",
|
||||
" \"onnxruntime\": {\n",
|
||||
" \"version\": \"1.3.0\",\n",
|
||||
" \"version\": \"1.4.0\",\n",
|
||||
" \"support_gpu\": false\n",
|
||||
" },\n",
|
||||
" \"pytorch\": {\n",
|
||||
|
|
@ -631,8 +635,8 @@
|
|||
" \"support_gpu\": false\n",
|
||||
" },\n",
|
||||
" \"tensorflow\": {\n",
|
||||
" \"version\": \"2.2.0\",\n",
|
||||
" \"git_version\": \"v2.2.0-rc4-8-g2b96f3662b\",\n",
|
||||
" \"version\": \"2.3.0\",\n",
|
||||
" \"git_version\": \"v2.3.0-rc2-23-gb36436b087\",\n",
|
||||
" \"support_gpu\": true\n",
|
||||
" }\n",
|
||||
"}\n"
|
||||
|
|
@ -642,7 +646,7 @@
|
|||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"2020-06-17 21:03:03.409601: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cudart64_101.dll\n"
|
||||
"2020-07-28 16:59:18.638897: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library cudart64_101.dll\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
|
|
|||
Loading…
Reference in a new issue