diff --git a/onnxruntime/python/tools/transformers/benchmark.py b/onnxruntime/python/tools/transformers/benchmark.py
index 6ceafb2c06..53e80423be 100644
--- a/onnxruntime/python/tools/transformers/benchmark.py
+++ b/onnxruntime/python/tools/transformers/benchmark.py
@@ -286,9 +286,17 @@ def run_tensorflow(use_gpu, model_names, model_class, precision, batch_sizes, se
                 input_ids = tf.constant(values, shape=(batch_size, sequence_length), dtype=tf.int32)
 
                 try:
-                    model(input_ids, training=False)
+                    def encoder_forward():  # forward pass that feeds the model input_ids only
+                        return model(input_ids, training=False)
 
-                    runtimes = timeit.repeat(lambda: model(input_ids, training=False), repeat=repeat_times, number=1)
+                    def encoder_decoder_forward():  # reuses the same input_ids tensor as decoder_input_ids
+                        return model(input_ids, decoder_input_ids=input_ids, training=False)
+
+                    inference = encoder_decoder_forward if config.is_encoder_decoder else encoder_forward
+
+                    inference()
+
+                    runtimes = timeit.repeat(lambda: inference(), repeat=repeat_times, number=1)
 
                     result = {
                         "engine": "tensorflow",
diff --git a/onnxruntime/python/tools/transformers/huggingface_models.py b/onnxruntime/python/tools/transformers/huggingface_models.py
index 04c79ed3b6..91ce85e1fc 100644
--- a/onnxruntime/python/tools/transformers/huggingface_models.py
+++ b/onnxruntime/python/tools/transformers/huggingface_models.py
@@ -89,11 +89,11 @@ MODELS = {
     "albert-xlarge-v2": (["input_ids"], 12, True, "bert"),
     "albert-xxlarge-v2": (["input_ids"], 12, True, "bert"),
     # T5
-    #"t5-small": (["input_ids"], 11, False, "bert"),
-    #"t5-base": (["input_ids"], 11, False, "bert"),
-    #"t5-large": (["input_ids"], 11, False, "bert"),
-    #"t5-3b": (["input_ids"], 11, False, "bert"),
-    #"t5-11b": (["input_ids"], 11, False, "bert"),
+    "t5-small": (["input_ids"], 12, False, "bert"),
+    "t5-base": (["input_ids"], 12, False, "bert"),
+    "t5-large": (["input_ids"], 12, True, "bert"),
+    "t5-3b": (["input_ids"], 12, True, "bert"),
+    "t5-11b": (["input_ids"], 12, True, "bert"),
     # XLM-RoBERTa
     "xlm-roberta-base": (["input_ids"], 11, False, "bert"),
     "xlm-roberta-large": (["input_ids"], 11, True, "bert"),
diff --git a/onnxruntime/python/tools/transformers/onnx_exporter.py b/onnxruntime/python/tools/transformers/onnx_exporter.py
index 0fa9e2d555..6f9ad1fc41 100644
--- a/onnxruntime/python/tools/transformers/onnx_exporter.py
+++ b/onnxruntime/python/tools/transformers/onnx_exporter.py
@@ -226,7 +226,9 @@ def load_pretrained_model(model_name, config, cache_dir, custom_model_class, is_
     transformers_module = __import__("transformers", fromlist=[model_class_name])
     model_class = getattr(transformers_module, model_class_name)
 
-    return model_class.from_pretrained(model_name, config=config, cache_dir=cache_dir)
+    use_cdn = False if model_name == 't5-11b' else True
+
+    return model_class.from_pretrained(model_name, config=config, cache_dir=cache_dir, use_cdn=use_cdn)
 
 
 def validate_and_optimize_onnx(model_name, use_external_data_format, model_type, onnx_dir, input_names, use_gpu,
diff --git a/onnxruntime/test/platform/threadpool_test.cc b/onnxruntime/test/platform/threadpool_test.cc
index fda2cbbc49..34ca40b87e 100644
--- a/onnxruntime/test/platform/threadpool_test.cc
+++ b/onnxruntime/test/platform/threadpool_test.cc
@@ -137,6 +137,30 @@ void TestBurstScheduling(const std::string& name, int num_tasks) {
   }
 }
 
+void TestPoolCreation(const std::string&, int iter) {  // first parameter (the test name) is unnamed and unused
+  // Test creating and destroying thread pools.  This can be used with Valgrind to help
+  // check for memory leaks related to the initialization and clean-up code.  For instance:
+  //
+  //  valgrind --leak-check=full ./onnxruntime_test_all --gtest_filter=ThreadPoolTest.TestPoolCreation_10Iter
+  //
+  // We create #iter thread pools in sequence, and run a ParallelFor over #per_iter steps in each.
+  std::atomic<std::ptrdiff_t> ctr{0};  // steps counted across all pools; atomic as callbacks presumably run concurrently
+  constexpr std::ptrdiff_t per_iter = 1024;
+  constexpr int num_threads = 4;
+  for (auto i = 0; i < iter; i++) {  // one pool per iteration; tp is local to the loop body
+    auto tp = onnxruntime::make_unique<ThreadPool>(&onnxruntime::Env::Default(),
+                                                   onnxruntime::ThreadOptions(),
+                                                   nullptr,
+                                                   num_threads,
+                                                   true);
+    tp->ParallelFor(per_iter, 0.0,
+                    [&](std::ptrdiff_t s, std::ptrdiff_t e) {
+                      ctr += e - s;  // add the number of steps in the chunk handed to this call
+                    });
+  }
+  ASSERT_EQ(ctr, iter * per_iter);  // chunk sizes summed over all pools must equal the total step count
+}
+
 }  // namespace
 
 namespace onnxruntime {
@@ -253,6 +277,19 @@ TEST(ThreadPoolTest, TestBurstScheduling_65536Task) {
   // buffer tasks.
   TestBurstScheduling("TestBurstScheduling_65536Tasks", 65536);
 }
+
+TEST(ThreadPoolTest, TestPoolCreation_1Iter) {  // a single pool
+  TestPoolCreation("TestPoolCreation_1Iter", 1);
+}
+
+TEST(ThreadPoolTest, TestPoolCreation_10Iter) {  // the case named in the Valgrind example in TestPoolCreation
+  TestPoolCreation("TestPoolCreation_10Iter", 10);
+}
+
+TEST(ThreadPoolTest, TestPoolCreation_100Iter) {  // 100 pools, one after another
+  TestPoolCreation("TestPoolCreation_100Iter", 100);
+}
+
 #ifdef _WIN32
 TEST(ThreadPoolTest, TestStackSize) {
   ThreadOptions to;