diff --git a/onnxruntime/python/tools/transformers/notebooks/Tensorflow_Keras_Bert-Squad_OnnxRuntime_CPU.ipynb b/onnxruntime/python/tools/transformers/notebooks/Tensorflow_Keras_Bert-Squad_OnnxRuntime_CPU.ipynb index ea7b5f7ab8..caba3614fa 100644 --- a/onnxruntime/python/tools/transformers/notebooks/Tensorflow_Keras_Bert-Squad_OnnxRuntime_CPU.ipynb +++ b/onnxruntime/python/tools/transformers/notebooks/Tensorflow_Keras_Bert-Squad_OnnxRuntime_CPU.ipynb @@ -54,11 +54,11 @@ "source": [ "import sys\n", " \n", - "!{sys.executable} -m pip install --quiet --upgrade tensorflow==2.2.0\n", - "!{sys.executable} -m pip install --quiet --upgrade onnxruntime\n", - "!{sys.executable} -m pip install --quiet --upgrade onnxruntime-tools\n", - "!{sys.executable} -m pip install --quiet --upgrade keras2onnx\n", - "!{sys.executable} -m pip install --quiet transformers==2.11.0\n", + "!{sys.executable} -m pip install --quiet --upgrade tensorflow==2.3.0\n", + "!{sys.executable} -m pip install --quiet --upgrade onnxruntime==1.4.0\n", + "!{sys.executable} -m pip install --quiet --upgrade onnxruntime-tools==1.4.0\n", + "!{sys.executable} -m pip install --quiet --upgrade keras2onnx==1.7.0\n", + "!{sys.executable} -m pip install --quiet transformers==3.0.2\n", "!{sys.executable} -m pip install --quiet wget pandas" ] }, @@ -92,13 +92,23 @@ "outputs": [], "source": [ "import os\n", - "cache_dir = './cached_models'\n", + "cache_dir = './cache_models'\n", "output_dir = './onnx_models'\n", "for directory in [cache_dir, output_dir]:\n", " if not os.path.exists(directory):\n", " os.makedirs(directory)" ] }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "tf.config.set_visible_devices([], 'GPU') # Disable GPU for fair comparison" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -115,16 +125,29 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": { "scrolled": true }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of the model checkpoint at bert-base-cased were not used when initializing TFBertForQuestionAnswering: ['nsp___cls', 'mlm___cls']\n", + "- This IS expected if you are initializing TFBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).\n", + "- This IS NOT expected if you are initializing TFBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", + "Some weights of TFBertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" + ] + } + ], "source": [ "from transformers import (TFBertForQuestionAnswering, BertTokenizer)\n", "\n", "#model_name_or_path = 'bert-large-uncased-whole-word-masking-finetuned-squad'\n", "model_name_or_path = \"bert-base-cased\"\n", + "is_fine_tuned = (model_name_or_path == 'bert-large-uncased-whole-word-masking-finetuned-squad')\n", "\n", "# Load model and tokenizer\n", "tokenizer = BertTokenizer.from_pretrained(model_name_or_path, do_lower_case=True, cache_dir=cache_dir)\n", @@ -144,42 +167,34 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The answer is: [CLS] what is on ##nx run ##time ? [SEP] on ##nx run ##time is a performance - focused in ##ference engine for on ##nx models\n" - ] - } - ], + "outputs": [], "source": [ - "import tensorflow as tf\n", "import numpy\n", "\n", "question, text = \"What is ONNX Runtime?\", \"ONNX Runtime is a performance-focused inference engine for ONNX models.\"\n", "# Pad to max length is needed. Otherwise, position embedding might be truncated by constant folding.\n", "inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors='tf',\n", - " max_length=max_sequence_length, pad_to_max_length=True)\n", + " max_length=max_sequence_length, pad_to_max_length=True, truncation=True)\n", "start_scores, end_scores = model(inputs)\n", "\n", "num_tokens = len(inputs[\"input_ids\"][0])\n", - "all_tokens = tokenizer.convert_ids_to_tokens(inputs[\"input_ids\"][0])\n", - "print(\"The answer is:\", ' '.join(all_tokens[numpy.argmax(start_scores) : numpy.argmax(end_scores)+1]))" + "if is_fine_tuned:\n", + " all_tokens = tokenizer.convert_ids_to_tokens(inputs[\"input_ids\"][0])\n", + " print(\"The answer is:\", ' '.join(all_tokens[numpy.argmax(start_scores) : numpy.argmax(end_scores)+1]))" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Tensorflow Inference time for sequence length 512 = 94.62 ms\n" + "Tensorflow Inference time for sequence length 512 = 1133.13 ms\n" ] } ], @@ -203,7 +218,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -239,7 +254,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -260,14 +275,14 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "ONNX Runtime cpu inference time for sequence length 512 (model not optimized): 630.54 ms\n" + "ONNX Runtime cpu inference time for sequence length 512 (model not optimized): 654.49 ms\n" ] } ], @@ -280,7 +295,7 @@ "\n", "# intra_op_num_threads=1 can be used to enable OpenMP in OnnxRuntime 1.2.0.\n", "# For OnnxRuntime 1.3.0 or later, this does not have effect unless you are using onnxruntime-gpu package.\n", - "sess_options.intra_op_num_threads=1\n", + "# sess_options.intra_op_num_threads=1\n", "\n", "# Providers is optional. Only needed when you use onnxruntime-gpu for CPU inference.\n", "session = onnxruntime.InferenceSession(output_model_path, sess_options, providers=['CPUExecutionProvider'])\n", @@ -302,26 +317,15 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "***** Verifying correctness (TensorFlow and ONNX Runtime) *****\n", - "WARNING:tensorflow:From :2: _EagerTensorBase.cpu (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "Use tf.identity instead.\n", - "start_scores are close: False\n", - "end_scores are close: False\n" - ] - } - ], + "outputs": [], "source": [ - "print(\"***** Verifying correctness (TensorFlow and ONNX Runtime) *****\")\n", - "print('start_scores are close:', numpy.allclose(results[0], start_scores.cpu(), rtol=1e-05, atol=1e-04))\n", - "print('end_scores are close:', numpy.allclose(results[1], end_scores.cpu(), rtol=1e-05, atol=1e-04))" + "# Some weights of TFBertForQuestionAnswering might not be initialized without fine-tuning.\n", + "if is_fine_tuned:\n", + " print(\"***** Verifying correctness (TensorFlow and ONNX Runtime) *****\")\n", + " print('start_scores are close:', numpy.allclose(results[0], start_scores.cpu(), rtol=1e-05, atol=1e-04))\n", + " print('end_scores are close:', numpy.allclose(results[1], end_scores.cpu(), rtol=1e-05, atol=1e-04))" ] }, { @@ -346,7 +350,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -367,14 +371,14 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "ONNX Runtime cpu inference time on optimized model: 369.18 ms\n" + "ONNX Runtime cpu inference time on optimized model: 328.48 ms\n" ] } ], @@ -394,7 +398,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -402,15 +406,15 @@ "output_type": "stream", "text": [ "***** Verifying correctness (before and after optimization) *****\n", - "start_scores are close: False\n", - "end_scores are close: False\n" + "start_scores are close: True\n", + "end_scores are close: True\n" ] } ], "source": [ "print(\"***** Verifying correctness (before and after optimization) *****\")\n", - "print('start_scores are close:', numpy.allclose(opt_results[0], start_scores.cpu(), rtol=1e-05, atol=1e-04))\n", - "print('end_scores are close:', numpy.allclose(opt_results[1], end_scores.cpu(), rtol=1e-05, atol=1e-04))" + "print('start_scores are close:', numpy.allclose(opt_results[0], results[0], rtol=1e-05, atol=1e-04))\n", + "print('end_scores are close:', numpy.allclose(opt_results[1], results[1], rtol=1e-05, atol=1e-04))" ] }, { @@ -426,7 +430,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -434,8 +438,8 @@ "output_type": "stream", "text": [ "100% passed for 10 random inputs given thresholds (rtol=0.001, atol=0.0001).\n", - "maximum absolute difference=1.2461096048355103e-06\n", - "maximum relative difference=0.006510902661830187\n" + "maximum absolute difference=1.6242265701293945e-06\n", + "maximum relative difference=0.009154098108410835\n" ] } ], @@ -457,7 +461,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -465,17 +469,17 @@ "output_type": "stream", "text": [ "Running test: model=keras_bert-base-cased_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n", - "Average latency = 99.01 ms, Throughput = 10.10 QPS\n", + "Average latency = 97.93 ms, Throughput = 10.21 QPS\n", "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=100, test_times=1, contiguous=None, use_gpu=False, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n", "Generating 100 samples for batch_size=1 sequence_length=128\n", - "Test summary is saved to onnx_models\\perf_results_CPU_B1_S128_20200617-210258.txt\n" + "Test summary is saved to onnx_models\\perf_results_CPU_B1_S128_20200728-165907.txt\n" ] } ], "source": [ "THREAD_SETTING = '--intra_op_num_threads 1 --omp_num_threads {} --omp_wait_policy ACTIVE'.format(psutil.cpu_count(logical=True))\n", "\n", - "!{sys.executable} -m onnxruntime_tools.transformers.bert_perf_test --model $optimized_model_path --batch_size 1 --sequence_length 128 --samples 100 --test_times 1 --inclusive $THREAD_SETTING\n" + "!{sys.executable} -m onnxruntime_tools.transformers.bert_perf_test --model $optimized_model_path --batch_size 1 --sequence_length 128 --samples 100 --test_times 1 --inclusive $THREAD_SETTING" ] }, { @@ -487,14 +491,14 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "./onnx_models\\perf_results_CPU_B1_S128_20200617-210258.txt\n" + "./onnx_models\\perf_results_CPU_B1_S128_20200728-165907.txt\n" ] }, { @@ -534,9 +538,9 @@ " 12\n", " ACTIVE\n", " None\n", - " 99.01\n", - " 130.11\n", - " 10.1\n", + " 97.93\n", + " 158.16\n", + " 10.21\n", " \n", " \n", "\n", @@ -547,10 +551,10 @@ "0 1 12 ACTIVE None \n", "\n", " Latency(ms) Latency_P99 Throughput(QPS) \n", - "0 99.01 130.11 10.1 " + "0 97.93 158.16 10.21 " ] }, - "execution_count": 16, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -589,7 +593,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -602,7 +606,7 @@ " \"devices\": [\n", " {\n", " \"memory_total\": 8589934592,\n", - " \"memory_available\": 1643134976,\n", + " \"memory_available\": 8480882688,\n", " \"name\": \"GeForce GTX 1070\"\n", " }\n", " ]\n", @@ -618,12 +622,12 @@ " },\n", " \"memory\": {\n", " \"total\": 16971259904,\n", - " \"available\": 3282817024\n", + " \"available\": 3480842240\n", " },\n", " \"python\": \"3.6.10.final.0 (64 bit)\",\n", " \"os\": \"Windows-10-10.0.18362-SP0\",\n", " \"onnxruntime\": {\n", - " \"version\": \"1.3.0\",\n", + " \"version\": \"1.4.0\",\n", " \"support_gpu\": false\n", " },\n", " \"pytorch\": {\n", @@ -631,8 +635,8 @@ " \"support_gpu\": false\n", " },\n", " \"tensorflow\": {\n", - " \"version\": \"2.2.0\",\n", - " \"git_version\": \"v2.2.0-rc4-8-g2b96f3662b\",\n", + " \"version\": \"2.3.0\",\n", + " \"git_version\": \"v2.3.0-rc2-23-gb36436b087\",\n", " \"support_gpu\": true\n", " }\n", "}\n" @@ -642,7 +646,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "2020-06-17 21:03:03.409601: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cudart64_101.dll\n" + "2020-07-28 16:59:18.638897: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library cudart64_101.dll\n" ] } ],