From 1a12f510fc0c18519e545f04cf5358d65a1ea633 Mon Sep 17 00:00:00 2001 From: Ye Wang <52801275+wangyems@users.noreply.github.com> Date: Tue, 29 Sep 2020 22:58:28 -0700 Subject: [PATCH 1/2] Support T5 benchmarking in transformers tool (#5133) * init checkin * review comments * modify according to transformers release --- onnxruntime/python/tools/transformers/benchmark.py | 12 ++++++++++-- .../python/tools/transformers/huggingface_models.py | 10 +++++----- .../python/tools/transformers/onnx_exporter.py | 4 +++- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/onnxruntime/python/tools/transformers/benchmark.py b/onnxruntime/python/tools/transformers/benchmark.py index 6ceafb2c06..53e80423be 100644 --- a/onnxruntime/python/tools/transformers/benchmark.py +++ b/onnxruntime/python/tools/transformers/benchmark.py @@ -286,9 +286,17 @@ def run_tensorflow(use_gpu, model_names, model_class, precision, batch_sizes, se input_ids = tf.constant(values, shape=(batch_size, sequence_length), dtype=tf.int32) try: - model(input_ids, training=False) + def encoder_forward(): + return model(input_ids, training=False) - runtimes = timeit.repeat(lambda: model(input_ids, training=False), repeat=repeat_times, number=1) + def encoder_decoder_forward(): + return model(input_ids, decoder_input_ids=input_ids, training=False) + + inference = encoder_decoder_forward if config.is_encoder_decoder else encoder_forward + + inference() + + runtimes = timeit.repeat(lambda: inference(), repeat=repeat_times, number=1) result = { "engine": "tensorflow", diff --git a/onnxruntime/python/tools/transformers/huggingface_models.py b/onnxruntime/python/tools/transformers/huggingface_models.py index 04c79ed3b6..91ce85e1fc 100644 --- a/onnxruntime/python/tools/transformers/huggingface_models.py +++ b/onnxruntime/python/tools/transformers/huggingface_models.py @@ -89,11 +89,11 @@ MODELS = { "albert-xlarge-v2": (["input_ids"], 12, True, "bert"), "albert-xxlarge-v2": (["input_ids"], 12, True, "bert"), # 
T5 - #"t5-small": (["input_ids"], 11, False, "bert"), - #"t5-base": (["input_ids"], 11, False, "bert"), - #"t5-large": (["input_ids"], 11, False, "bert"), - #"t5-3b": (["input_ids"], 11, False, "bert"), - #"t5-11b": (["input_ids"], 11, False, "bert"), + "t5-small": (["input_ids"], 12, False, "bert"), + "t5-base": (["input_ids"], 12, False, "bert"), + "t5-large": (["input_ids"], 12, True, "bert"), + "t5-3b": (["input_ids"], 12, True, "bert"), + "t5-11b": (["input_ids"], 12, True, "bert"), # XLM-RoBERTa "xlm-roberta-base": (["input_ids"], 11, False, "bert"), "xlm-roberta-large": (["input_ids"], 11, True, "bert"), diff --git a/onnxruntime/python/tools/transformers/onnx_exporter.py b/onnxruntime/python/tools/transformers/onnx_exporter.py index 0fa9e2d555..6f9ad1fc41 100644 --- a/onnxruntime/python/tools/transformers/onnx_exporter.py +++ b/onnxruntime/python/tools/transformers/onnx_exporter.py @@ -226,7 +226,9 @@ def load_pretrained_model(model_name, config, cache_dir, custom_model_class, is_ transformers_module = __import__("transformers", fromlist=[model_class_name]) model_class = getattr(transformers_module, model_class_name) - return model_class.from_pretrained(model_name, config=config, cache_dir=cache_dir) + use_cdn = False if model_name == 't5-11b' else True + + return model_class.from_pretrained(model_name, config=config, cache_dir=cache_dir, use_cdn=use_cdn) def validate_and_optimize_onnx(model_name, use_external_data_format, model_type, onnx_dir, input_names, use_gpu, From 69dbaaa015beff0e83cbfdc52363d1a1c9cdb51b Mon Sep 17 00:00:00 2001 From: Tim Harris Date: Wed, 30 Sep 2020 11:26:02 +0100 Subject: [PATCH 2/2] Add additional test cases to check for leaks in thread pool creation / destruction (#5311) Add additional test cases such as ThreadPoolTest.TestPoolCreation_10Iter to create and destroy thread pools to watch for any memory leaks. Running under Valgrind, these tests should show all of the data allocated being deallocated again. 
Two recent issues, #5176 and #5292, reported memory leaks. The test cases help identify whether or not any of the data structures used in the thread pool are being leaked. Currently, on WSL, the only data not de-allocated in these tests is a small number of nsync waiter objects. This behavior is as expected (the waiter objects should be held on a free list in the nsync library). --- onnxruntime/test/platform/threadpool_test.cc | 37 ++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/onnxruntime/test/platform/threadpool_test.cc b/onnxruntime/test/platform/threadpool_test.cc index fda2cbbc49..34ca40b87e 100644 --- a/onnxruntime/test/platform/threadpool_test.cc +++ b/onnxruntime/test/platform/threadpool_test.cc @@ -137,6 +137,30 @@ void TestBurstScheduling(const std::string& name, int num_tasks) { } } +void TestPoolCreation(const std::string&, int iter) { + // Test creating and destroying thread pools. This can be used with Valgrind to help + // check for memory leaks related to the initialization and clean-up code. For instance + // + // valgrind --leak-check=full ./onnxruntime_test_all --gtest_filter=ThreadPoolTest.TestPoolCreation_10Iter + // + // We create #iter thread pools, and within each of them run a loop of #per_iter steps. + std::atomic<std::ptrdiff_t> ctr{0}; + constexpr std::ptrdiff_t per_iter = 1024; + constexpr int num_threads = 4; + for (auto i = 0; i < iter; i++) { + auto tp = onnxruntime::make_unique<onnxruntime::concurrency::ThreadPool>(&onnxruntime::Env::Default(), + onnxruntime::ThreadOptions(), + nullptr, + num_threads, + true); + tp->ParallelFor(per_iter, 0.0, + [&](std::ptrdiff_t s, std::ptrdiff_t e) { + ctr += e - s; + }); + } + ASSERT_EQ(ctr, iter * per_iter); +} + } // namespace namespace onnxruntime { @@ -253,6 +277,19 @@ TEST(ThreadPoolTest, TestBurstScheduling_65536Task) { // buffer tasks. 
TestBurstScheduling("TestBurstScheduling_65536Tasks", 65536); } + +TEST(ThreadPoolTest, TestPoolCreation_1Iter) { + TestPoolCreation("TestPoolCreation_1Iter", 1); +} + +TEST(ThreadPoolTest, TestPoolCreation_10Iter) { + TestPoolCreation("TestPoolCreation_10Iter", 10); +} + +TEST(ThreadPoolTest, TestPoolCreation_100Iter) { + TestPoolCreation("TestPoolCreation_100Iter", 100); +} + #ifdef _WIN32 TEST(ThreadPoolTest, TestStackSize) { ThreadOptions to;