Update notebook: disable GPU for tensorflow (#4649)

2026-07-11 17:48:34 +00:00 · 2020-07-29 10:09:06 -07:00 · 2020-07-29 10:09:06 -07:00 · 326cc686df
commit 326cc686df
parent 623dd53eb7
1 changed files with 80 additions and 76 deletions
--- a/onnxruntime/python/tools/transformers/notebooks/Tensorflow_Keras_Bert-Squad_OnnxRuntime_CPU.ipynb
+++ b/onnxruntime/python/tools/transformers/notebooks/Tensorflow_Keras_Bert-Squad_OnnxRuntime_CPU.ipynb
@ -54,11 +54,11 @@
   "source": [
    "import sys\n",
    " \n",
-    "!{sys.executable} -m pip install --quiet --upgrade tensorflow==2.2.0\n",
-    "!{sys.executable} -m pip install --quiet --upgrade onnxruntime\n",
-    "!{sys.executable} -m pip install --quiet --upgrade onnxruntime-tools\n",
-    "!{sys.executable} -m pip install --quiet --upgrade keras2onnx\n",
-    "!{sys.executable} -m pip install --quiet transformers==2.11.0\n",
+    "!{sys.executable} -m pip install --quiet --upgrade tensorflow==2.3.0\n",
+    "!{sys.executable} -m pip install --quiet --upgrade onnxruntime==1.4.0\n",
+    "!{sys.executable} -m pip install --quiet --upgrade onnxruntime-tools==1.4.0\n",
+    "!{sys.executable} -m pip install --quiet --upgrade keras2onnx==1.7.0\n",
+    "!{sys.executable} -m pip install --quiet transformers==3.0.2\n",
    "!{sys.executable} -m pip install --quiet wget pandas"
   ]
  },
@ -92,13 +92,23 @@
   "outputs": [],
   "source": [
    "import os\n",
-    "cache_dir = './cached_models'\n",
+    "cache_dir = './cache_models'\n",
    "output_dir = './onnx_models'\n",
    "for directory in [cache_dir, output_dir]:\n",
    "    if not os.path.exists(directory):\n",
    "        os.makedirs(directory)"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tensorflow as tf\n",
+    "tf.config.set_visible_devices([], device_type='GPU')  # Hide all GPUs so TensorFlow runs on CPU, for a fair comparison with ONNX Runtime CPU inference"
+   ]
+  },
  {
   "cell_type": "markdown",
   "metadata": {},
@ -115,16 +125,29 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
   "metadata": {
    "scrolled": true
   },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Some weights of the model checkpoint at bert-base-cased were not used when initializing TFBertForQuestionAnswering: ['nsp___cls', 'mlm___cls']\n",
+      "- This IS expected if you are initializing TFBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).\n",
+      "- This IS NOT expected if you are initializing TFBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+      "Some weights of TFBertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs']\n",
+      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
+     ]
+    }
+   ],
   "source": [
    "from transformers import (TFBertForQuestionAnswering, BertTokenizer)\n",
    "\n",
    "#model_name_or_path = 'bert-large-uncased-whole-word-masking-finetuned-squad'\n",
    "model_name_or_path = \"bert-base-cased\"\n",
+    "is_fine_tuned = 'finetuned-squad' in model_name_or_path  # only SQuAD fine-tuned checkpoints ship trained qa_outputs weights\n",
    "\n",
    "# Load model and tokenizer\n",
    "tokenizer = BertTokenizer.from_pretrained(model_name_or_path, do_lower_case=True, cache_dir=cache_dir)\n",
@ -144,42 +167,34 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "The answer is: [CLS] what is on ##nx run ##time ? [SEP] on ##nx run ##time is a performance - focused in ##ference engine for on ##nx models\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
-    "import tensorflow as tf\n",
    "import numpy\n",
    "\n",
    "question, text = \"What is ONNX Runtime?\", \"ONNX Runtime is a performance-focused inference engine for ONNX models.\"\n",
    "# Pad to max length is needed. Otherwise, position embedding might be truncated by constant folding.\n",
    "inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors='tf',\n",
-    "                               max_length=max_sequence_length, pad_to_max_length=True)\n",
+    "                               max_length=max_sequence_length, padding='max_length', truncation=True)\n",
    "start_scores, end_scores = model(inputs)\n",
    "\n",
    "num_tokens = len(inputs[\"input_ids\"][0])\n",
-    "all_tokens = tokenizer.convert_ids_to_tokens(inputs[\"input_ids\"][0])\n",
-    "print(\"The answer is:\", ' '.join(all_tokens[numpy.argmax(start_scores) : numpy.argmax(end_scores)+1]))"
+    "if is_fine_tuned:\n",
+    "    all_tokens = tokenizer.convert_ids_to_tokens(inputs[\"input_ids\"][0])\n",
+    "    print(\"The answer is:\", ' '.join(all_tokens[numpy.argmax(start_scores) : numpy.argmax(end_scores)+1]))"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Tensorflow Inference time for sequence length 512 = 94.62 ms\n"
+      "Tensorflow Inference time for sequence length 512 = 1133.13 ms\n"
     ]
    }
   ],
@ -203,7 +218,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
@ -239,7 +254,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
@ -260,14 +275,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "ONNX Runtime cpu inference time for sequence length 512 (model not optimized): 630.54 ms\n"
+      "ONNX Runtime cpu inference time for sequence length 512 (model not optimized): 654.49 ms\n"
     ]
    }
   ],
@ -280,7 +295,7 @@
    "\n",
    "# intra_op_num_threads=1 can be used to enable OpenMP in OnnxRuntime 1.2.0.\n",
    "# For OnnxRuntime 1.3.0 or later, this does not have effect unless you are using onnxruntime-gpu package.\n",
-    "sess_options.intra_op_num_threads=1\n",
+    "# sess_options.intra_op_num_threads=1  # left disabled: per the note above, no effect for OnnxRuntime 1.3.0+ unless using onnxruntime-gpu\n",
    "\n",
    "# Providers is optional. Only needed when you use onnxruntime-gpu for CPU inference.\n",
    "session = onnxruntime.InferenceSession(output_model_path, sess_options, providers=['CPUExecutionProvider'])\n",
@ -302,26 +317,15 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 11,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "***** Verifying correctness (TensorFlow and ONNX Runtime) *****\n",
-      "WARNING:tensorflow:From <ipython-input-10-453158d8869f>:2: _EagerTensorBase.cpu (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\n",
-      "Instructions for updating:\n",
-      "Use tf.identity instead.\n",
-      "start_scores are close: False\n",
-      "end_scores are close: False\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
-    "print(\"***** Verifying correctness (TensorFlow and ONNX Runtime) *****\")\n",
-    "print('start_scores are close:', numpy.allclose(results[0], start_scores.cpu(), rtol=1e-05, atol=1e-04))\n",
-    "print('end_scores are close:', numpy.allclose(results[1], end_scores.cpu(), rtol=1e-05, atol=1e-04))"
+    "# Some weights of TFBertForQuestionAnswering might not be initialized without fine-tuning.\n",
+    "if is_fine_tuned:\n",
+    "    print(\"***** Verifying correctness (TensorFlow and ONNX Runtime) *****\")\n",
+    "    print('start_scores are close:', numpy.allclose(results[0], start_scores.numpy(), rtol=1e-05, atol=1e-04))\n",
+    "    print('end_scores are close:', numpy.allclose(results[1], end_scores.numpy(), rtol=1e-05, atol=1e-04))"
   ]
  },
  {
@ -346,7 +350,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
@ -367,14 +371,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "ONNX Runtime cpu inference time on optimized model: 369.18 ms\n"
+      "ONNX Runtime cpu inference time on optimized model: 328.48 ms\n"
     ]
    }
   ],
@ -394,7 +398,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
@ -402,15 +406,15 @@
     "output_type": "stream",
     "text": [
      "***** Verifying correctness (before and after optimization) *****\n",
-      "start_scores are close: False\n",
-      "end_scores are close: False\n"
+      "start_scores are close: True\n",
+      "end_scores are close: True\n"
     ]
    }
   ],
   "source": [
    "print(\"***** Verifying correctness (before and after optimization) *****\")\n",
-    "print('start_scores are close:', numpy.allclose(opt_results[0], start_scores.cpu(), rtol=1e-05, atol=1e-04))\n",
-    "print('end_scores are close:', numpy.allclose(opt_results[1], end_scores.cpu(), rtol=1e-05, atol=1e-04))"
+    "print('start_scores are close:', numpy.allclose(opt_results[0], results[0], rtol=1e-05, atol=1e-04))\n",
+    "print('end_scores are close:', numpy.allclose(opt_results[1], results[1], rtol=1e-05, atol=1e-04))"
   ]
  },
  {
@ -426,7 +430,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
@ -434,8 +438,8 @@
     "output_type": "stream",
     "text": [
      "100% passed for 10 random inputs given thresholds (rtol=0.001, atol=0.0001).\n",
-      "maximum absolute difference=1.2461096048355103e-06\n",
-      "maximum relative difference=0.006510902661830187\n"
+      "maximum absolute difference=1.6242265701293945e-06\n",
+      "maximum relative difference=0.009154098108410835\n"
     ]
    }
   ],
@ -457,7 +461,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
@ -465,17 +469,17 @@
     "output_type": "stream",
     "text": [
      "Running test: model=keras_bert-base-cased_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
-      "Average latency = 99.01 ms, Throughput = 10.10 QPS\n",
+      "Average latency = 97.93 ms, Throughput = 10.21 QPS\n",
      "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=100, test_times=1, contiguous=None, use_gpu=False, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n",
      "Generating 100 samples for batch_size=1 sequence_length=128\n",
-      "Test summary is saved to onnx_models\\perf_results_CPU_B1_S128_20200617-210258.txt\n"
+      "Test summary is saved to onnx_models\\perf_results_CPU_B1_S128_20200728-165907.txt\n"
     ]
    }
   ],
   "source": [
    "THREAD_SETTING = '--intra_op_num_threads 1 --omp_num_threads {} --omp_wait_policy ACTIVE'.format(psutil.cpu_count(logical=True))\n",
    "\n",
-    "!{sys.executable} -m onnxruntime_tools.transformers.bert_perf_test --model $optimized_model_path --batch_size 1 --sequence_length 128 --samples 100 --test_times 1 --inclusive $THREAD_SETTING\n"
+    "!{sys.executable} -m onnxruntime_tools.transformers.bert_perf_test --model $optimized_model_path --batch_size 1 --sequence_length 128 --samples 100 --test_times 1 --inclusive $THREAD_SETTING"
   ]
  },
  {
@ -487,14 +491,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "./onnx_models\\perf_results_CPU_B1_S128_20200617-210258.txt\n"
+      "./onnx_models\\perf_results_CPU_B1_S128_20200728-165907.txt\n"
     ]
    },
    {
@ -534,9 +538,9 @@
       "      <td>12</td>\n",
       "      <td>ACTIVE</td>\n",
       "      <td>None</td>\n",
-       "      <td>99.01</td>\n",
-       "      <td>130.11</td>\n",
-       "      <td>10.1</td>\n",
+       "      <td>97.93</td>\n",
+       "      <td>158.16</td>\n",
+       "      <td>10.21</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
@ -547,10 +551,10 @@
       "0                     1              12          ACTIVE       None   \n",
       "\n",
       "   Latency(ms)  Latency_P99  Throughput(QPS)  \n",
-       "0        99.01       130.11             10.1  "
+       "0        97.93       158.16            10.21  "
      ]
     },
-     "execution_count": 16,
+     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -589,7 +593,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
@ -602,7 +606,7 @@
      "    \"devices\": [\n",
      "      {\n",
      "        \"memory_total\": 8589934592,\n",
-      "        \"memory_available\": 1643134976,\n",
+      "        \"memory_available\": 8480882688,\n",
      "        \"name\": \"GeForce GTX 1070\"\n",
      "      }\n",
      "    ]\n",
@ -618,12 +622,12 @@
      "  },\n",
      "  \"memory\": {\n",
      "    \"total\": 16971259904,\n",
-      "    \"available\": 3282817024\n",
+      "    \"available\": 3480842240\n",
      "  },\n",
      "  \"python\": \"3.6.10.final.0 (64 bit)\",\n",
      "  \"os\": \"Windows-10-10.0.18362-SP0\",\n",
      "  \"onnxruntime\": {\n",
-      "    \"version\": \"1.3.0\",\n",
+      "    \"version\": \"1.4.0\",\n",
      "    \"support_gpu\": false\n",
      "  },\n",
      "  \"pytorch\": {\n",
@ -631,8 +635,8 @@
      "    \"support_gpu\": false\n",
      "  },\n",
      "  \"tensorflow\": {\n",
-      "    \"version\": \"2.2.0\",\n",
-      "    \"git_version\": \"v2.2.0-rc4-8-g2b96f3662b\",\n",
+      "    \"version\": \"2.3.0\",\n",
+      "    \"git_version\": \"v2.3.0-rc2-23-gb36436b087\",\n",
      "    \"support_gpu\": true\n",
      "  }\n",
      "}\n"
@ -642,7 +646,7 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-      "2020-06-17 21:03:03.409601: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cudart64_101.dll\n"
+      "2020-07-28 16:59:18.638897: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library cudart64_101.dll\n"
     ]
    }
   ],