Update notebook: disable GPU for tensorflow (#4649)

This commit is contained in:
Tianlei Wu 2020-07-29 10:09:06 -07:00 committed by GitHub
parent 623dd53eb7
commit 326cc686df
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -54,11 +54,11 @@
"source": [
"import sys\n",
" \n",
"!{sys.executable} -m pip install --quiet --upgrade tensorflow==2.2.0\n",
"!{sys.executable} -m pip install --quiet --upgrade onnxruntime\n",
"!{sys.executable} -m pip install --quiet --upgrade onnxruntime-tools\n",
"!{sys.executable} -m pip install --quiet --upgrade keras2onnx\n",
"!{sys.executable} -m pip install --quiet transformers==2.11.0\n",
"!{sys.executable} -m pip install --quiet --upgrade tensorflow==2.3.0\n",
"!{sys.executable} -m pip install --quiet --upgrade onnxruntime==1.4.0\n",
"!{sys.executable} -m pip install --quiet --upgrade onnxruntime-tools==1.4.0\n",
"!{sys.executable} -m pip install --quiet --upgrade keras2onnx==1.7.0\n",
"!{sys.executable} -m pip install --quiet transformers==3.0.2\n",
"!{sys.executable} -m pip install --quiet wget pandas"
]
},
@ -92,13 +92,23 @@
"outputs": [],
"source": [
"import os\n",
"cache_dir = './cached_models'\n",
"cache_dir = './cache_models'\n",
"output_dir = './onnx_models'\n",
"for directory in [cache_dir, output_dir]:\n",
" if not os.path.exists(directory):\n",
" os.makedirs(directory)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import tensorflow as tf\n",
"tf.config.set_visible_devices([], 'GPU') # Disable GPU for fair comparison"
]
},
{
"cell_type": "markdown",
"metadata": {},
@ -115,16 +125,29 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"metadata": {
"scrolled": true
},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Some weights of the model checkpoint at bert-base-cased were not used when initializing TFBertForQuestionAnswering: ['nsp___cls', 'mlm___cls']\n",
"- This IS expected if you are initializing TFBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).\n",
"- This IS NOT expected if you are initializing TFBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"Some weights of TFBertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
]
}
],
"source": [
"from transformers import (TFBertForQuestionAnswering, BertTokenizer)\n",
"\n",
"#model_name_or_path = 'bert-large-uncased-whole-word-masking-finetuned-squad'\n",
"model_name_or_path = \"bert-base-cased\"\n",
"is_fine_tuned = (model_name_or_path == 'bert-large-uncased-whole-word-masking-finetuned-squad')\n",
"\n",
"# Load model and tokenizer\n",
"tokenizer = BertTokenizer.from_pretrained(model_name_or_path, do_lower_case=True, cache_dir=cache_dir)\n",
@ -144,42 +167,34 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The answer is: [CLS] what is on ##nx run ##time ? [SEP] on ##nx run ##time is a performance - focused in ##ference engine for on ##nx models\n"
]
}
],
"outputs": [],
"source": [
"import tensorflow as tf\n",
"import numpy\n",
"\n",
"question, text = \"What is ONNX Runtime?\", \"ONNX Runtime is a performance-focused inference engine for ONNX models.\"\n",
"# Padding to max length is needed. Otherwise, the position embedding might be truncated by constant folding.\n",
"inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors='tf',\n",
" max_length=max_sequence_length, pad_to_max_length=True)\n",
" max_length=max_sequence_length, pad_to_max_length=True, truncation=True)\n",
"start_scores, end_scores = model(inputs)\n",
"\n",
"num_tokens = len(inputs[\"input_ids\"][0])\n",
"all_tokens = tokenizer.convert_ids_to_tokens(inputs[\"input_ids\"][0])\n",
"print(\"The answer is:\", ' '.join(all_tokens[numpy.argmax(start_scores) : numpy.argmax(end_scores)+1]))"
"if is_fine_tuned:\n",
" all_tokens = tokenizer.convert_ids_to_tokens(inputs[\"input_ids\"][0])\n",
" print(\"The answer is:\", ' '.join(all_tokens[numpy.argmax(start_scores) : numpy.argmax(end_scores)+1]))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Tensorflow Inference time for sequence length 512 = 94.62 ms\n"
"Tensorflow Inference time for sequence length 512 = 1133.13 ms\n"
]
}
],
@ -203,7 +218,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
@ -239,7 +254,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
@ -260,14 +275,14 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ONNX Runtime cpu inference time for sequence length 512 (model not optimized): 630.54 ms\n"
"ONNX Runtime cpu inference time for sequence length 512 (model not optimized): 654.49 ms\n"
]
}
],
@ -280,7 +295,7 @@
"\n",
"# intra_op_num_threads=1 can be used to enable OpenMP in OnnxRuntime 1.2.0.\n",
"# For OnnxRuntime 1.3.0 or later, this has no effect unless you are using the onnxruntime-gpu package.\n",
"sess_options.intra_op_num_threads=1\n",
"# sess_options.intra_op_num_threads=1\n",
"\n",
"# The providers argument is optional. It is only needed when you use onnxruntime-gpu for CPU inference.\n",
"session = onnxruntime.InferenceSession(output_model_path, sess_options, providers=['CPUExecutionProvider'])\n",
@ -302,26 +317,15 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"***** Verifying correctness (TensorFlow and ONNX Runtime) *****\n",
"WARNING:tensorflow:From <ipython-input-10-453158d8869f>:2: _EagerTensorBase.cpu (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\n",
"Instructions for updating:\n",
"Use tf.identity instead.\n",
"start_scores are close: False\n",
"end_scores are close: False\n"
]
}
],
"outputs": [],
"source": [
"print(\"***** Verifying correctness (TensorFlow and ONNX Runtime) *****\")\n",
"print('start_scores are close:', numpy.allclose(results[0], start_scores.cpu(), rtol=1e-05, atol=1e-04))\n",
"print('end_scores are close:', numpy.allclose(results[1], end_scores.cpu(), rtol=1e-05, atol=1e-04))"
"# Some weights of TFBertForQuestionAnswering might not be initialized without fine-tuning.\n",
"if is_fine_tuned:\n",
" print(\"***** Verifying correctness (TensorFlow and ONNX Runtime) *****\")\n",
" print('start_scores are close:', numpy.allclose(results[0], start_scores.cpu(), rtol=1e-05, atol=1e-04))\n",
" print('end_scores are close:', numpy.allclose(results[1], end_scores.cpu(), rtol=1e-05, atol=1e-04))"
]
},
{
@ -346,7 +350,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
@ -367,14 +371,14 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ONNX Runtime cpu inference time on optimized model: 369.18 ms\n"
"ONNX Runtime cpu inference time on optimized model: 328.48 ms\n"
]
}
],
@ -394,7 +398,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 14,
"metadata": {},
"outputs": [
{
@ -402,15 +406,15 @@
"output_type": "stream",
"text": [
"***** Verifying correctness (before and after optimization) *****\n",
"start_scores are close: False\n",
"end_scores are close: False\n"
"start_scores are close: True\n",
"end_scores are close: True\n"
]
}
],
"source": [
"print(\"***** Verifying correctness (before and after optimization) *****\")\n",
"print('start_scores are close:', numpy.allclose(opt_results[0], start_scores.cpu(), rtol=1e-05, atol=1e-04))\n",
"print('end_scores are close:', numpy.allclose(opt_results[1], end_scores.cpu(), rtol=1e-05, atol=1e-04))"
"print('start_scores are close:', numpy.allclose(opt_results[0], results[0], rtol=1e-05, atol=1e-04))\n",
"print('end_scores are close:', numpy.allclose(opt_results[1], results[1], rtol=1e-05, atol=1e-04))"
]
},
{
@ -426,7 +430,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 15,
"metadata": {},
"outputs": [
{
@ -434,8 +438,8 @@
"output_type": "stream",
"text": [
"100% passed for 10 random inputs given thresholds (rtol=0.001, atol=0.0001).\n",
"maximum absolute difference=1.2461096048355103e-06\n",
"maximum relative difference=0.006510902661830187\n"
"maximum absolute difference=1.6242265701293945e-06\n",
"maximum relative difference=0.009154098108410835\n"
]
}
],
@ -457,7 +461,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 16,
"metadata": {},
"outputs": [
{
@ -465,17 +469,17 @@
"output_type": "stream",
"text": [
"Running test: model=keras_bert-base-cased_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
"Average latency = 99.01 ms, Throughput = 10.10 QPS\n",
"Average latency = 97.93 ms, Throughput = 10.21 QPS\n",
"test setting TestSetting(batch_size=1, sequence_length=128, test_cases=100, test_times=1, contiguous=None, use_gpu=False, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n",
"Generating 100 samples for batch_size=1 sequence_length=128\n",
"Test summary is saved to onnx_models\\perf_results_CPU_B1_S128_20200617-210258.txt\n"
"Test summary is saved to onnx_models\\perf_results_CPU_B1_S128_20200728-165907.txt\n"
]
}
],
"source": [
"THREAD_SETTING = '--intra_op_num_threads 1 --omp_num_threads {} --omp_wait_policy ACTIVE'.format(psutil.cpu_count(logical=True))\n",
"\n",
"!{sys.executable} -m onnxruntime_tools.transformers.bert_perf_test --model $optimized_model_path --batch_size 1 --sequence_length 128 --samples 100 --test_times 1 --inclusive $THREAD_SETTING\n"
"!{sys.executable} -m onnxruntime_tools.transformers.bert_perf_test --model $optimized_model_path --batch_size 1 --sequence_length 128 --samples 100 --test_times 1 --inclusive $THREAD_SETTING"
]
},
{
@ -487,14 +491,14 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"./onnx_models\\perf_results_CPU_B1_S128_20200617-210258.txt\n"
"./onnx_models\\perf_results_CPU_B1_S128_20200728-165907.txt\n"
]
},
{
@ -534,9 +538,9 @@
" <td>12</td>\n",
" <td>ACTIVE</td>\n",
" <td>None</td>\n",
" <td>99.01</td>\n",
" <td>130.11</td>\n",
" <td>10.1</td>\n",
" <td>97.93</td>\n",
" <td>158.16</td>\n",
" <td>10.21</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
@ -547,10 +551,10 @@
"0 1 12 ACTIVE None \n",
"\n",
" Latency(ms) Latency_P99 Throughput(QPS) \n",
"0 99.01 130.11 10.1 "
"0 97.93 158.16 10.21 "
]
},
"execution_count": 16,
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
@ -589,7 +593,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 18,
"metadata": {},
"outputs": [
{
@ -602,7 +606,7 @@
" \"devices\": [\n",
" {\n",
" \"memory_total\": 8589934592,\n",
" \"memory_available\": 1643134976,\n",
" \"memory_available\": 8480882688,\n",
" \"name\": \"GeForce GTX 1070\"\n",
" }\n",
" ]\n",
@ -618,12 +622,12 @@
" },\n",
" \"memory\": {\n",
" \"total\": 16971259904,\n",
" \"available\": 3282817024\n",
" \"available\": 3480842240\n",
" },\n",
" \"python\": \"3.6.10.final.0 (64 bit)\",\n",
" \"os\": \"Windows-10-10.0.18362-SP0\",\n",
" \"onnxruntime\": {\n",
" \"version\": \"1.3.0\",\n",
" \"version\": \"1.4.0\",\n",
" \"support_gpu\": false\n",
" },\n",
" \"pytorch\": {\n",
@ -631,8 +635,8 @@
" \"support_gpu\": false\n",
" },\n",
" \"tensorflow\": {\n",
" \"version\": \"2.2.0\",\n",
" \"git_version\": \"v2.2.0-rc4-8-g2b96f3662b\",\n",
" \"version\": \"2.3.0\",\n",
" \"git_version\": \"v2.3.0-rc2-23-gb36436b087\",\n",
" \"support_gpu\": true\n",
" }\n",
"}\n"
@ -642,7 +646,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"2020-06-17 21:03:03.409601: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cudart64_101.dll\n"
"2020-07-28 16:59:18.638897: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library cudart64_101.dll\n"
]
}
],