TensorRT EP - timing cache (#14767)

### Description This will enable a user to use a TensorRT timing cache based on #10297 to accelerate build times on a device with the same compute capability. This will work across models, as it simply stores kernel runtimes for specific configurations. Those files are usually very small (only a few MB), which makes them very easy to ship with an application to accelerate the build time on the user end. ### Motivation and Context Especially for workstation use cases, TRT build times can be a roadblock. With a few models from the ONNX model zoo, I evaluated speedups when a timing cache is present. `./build/onnxruntime_perf_test -e tensorrt -I -t 5 -i "trt_timing_cache_enable|true" <onnx_path>` |Model | no Cache | with Cache| | ------------- | ------------- | ------------- | |efficientnet-lite4-11 | 34.6 s | 7.7 s| |yolov4 | 108.62 s | 9.4 s| To capture this I had to modify onnxruntime_perf_test. The time is sometimes not captured within "Session creation time cost:", which is why I introduced "First inference time cost:". --------- Co-authored-by: Chi Lo <Chi.Lo@microsoft.com>
2026-07-21 19:18:55 +00:00 · 2023-03-10 18:02:27 +01:00 · 2023-03-10 18:02:27 +01:00 · ad4db12699
commit ad4db12699
parent acbb7ad453
15 changed files with 303 additions and 18 deletions
--- a/cmake/external/onnxruntime_external_deps.cmake
+++ b/cmake/external/onnxruntime_external_deps.cmake
@ -455,7 +455,7 @@ if (onnxruntime_USE_CUDA)
        list(APPEND onnxruntime_LINK_DIRS ${onnxruntime_CUDA_HOME}/x64/lib64)
      else()
        if(onnxruntime_CUDNN_HOME)
-          list(APPEND onnxruntime_LINK_DIRS ${onnxruntime_CUDNN_HOME}/lib64)
+          list(APPEND onnxruntime_LINK_DIRS  ${onnxruntime_CUDNN_HOME}/lib ${onnxruntime_CUDNN_HOME}/lib64)
        endif()
        list(APPEND onnxruntime_LINK_DIRS ${onnxruntime_CUDA_HOME}/lib64)
      endif()
--- a/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h
+++ b/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h
@ -5,7 +5,7 @@

 /// <summary>
 /// Options for the TensorRT provider that are passed to SessionOptionsAppendExecutionProvider_TensorRT_V2.
-/// Please note that this struct is *similar* to OrtTensorRTProviderOptions but only to be used internally. 
+/// Please note that this struct is *similar* to OrtTensorRTProviderOptions but only to be used internally.
 /// Going forward, new trt provider options are to be supported via this struct and usage of the publicly defined
 /// OrtTensorRTProviderOptions will be deprecated over time.
 /// User can only get the instance of OrtTensorRTProviderOptionsV2 via CreateTensorRTProviderOptions.
@ -31,4 +31,7 @@ struct OrtTensorRTProviderOptionsV2 {
  int trt_force_sequential_engine_build;        // force building TensorRT engine sequentially. Default 0 = false, nonzero = true
  int trt_context_memory_sharing_enable;        // enable context memory sharing between subgraphs. Default 0 = false, nonzero = true
  int trt_layer_norm_fp32_fallback;             // force Pow + Reduce ops in layer norm to FP32. Default 0 = false, nonzero = true
+  int trt_timing_cache_enable;                  // enable TensorRT timing cache. Default 0 = false, nonzero = true
+  int trt_force_timing_cache;                   // force the TensorRT cache to be used even if device profile does not match. Default 0 = false, nonzero = true
+  int trt_detailed_build_log;                   // Enable detailed build step logging on TensorRT EP with timing for each engine build. Default 0 = false, nonzero = true
 };
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
@ -117,6 +117,32 @@ bool SetDynamicRange(nvinfer1::INetworkDefinition& network, std::unordered_map<s
  }
  return true;
 }
+
+inline std::vector<char> loadTimingCacheFile(const std::string inFileName) {
+  std::ifstream iFile(inFileName, std::ios::in | std::ios::binary);
+  if (!iFile) {
+    LOGS_DEFAULT(WARNING) << "[TensorRT EP] Could not read timing cache from: " << inFileName
+                          << ". A new timing cache will be generated and written.";
+    return std::vector<char>();
+  }
+  iFile.seekg(0, std::ifstream::end);
+  size_t fsize = iFile.tellg();
+  iFile.seekg(0, std::ifstream::beg);
+  std::vector<char> content(fsize);
+  iFile.read(content.data(), fsize);
+  iFile.close();
+  return content;
+}
+
+inline void saveTimingCacheFile(const std::string outFileName, const nvinfer1::IHostMemory* blob) {
+  std::ofstream oFile(outFileName, std::ios::out | std::ios::binary);
+  if (!oFile) {
+    LOGS_DEFAULT(WARNING) << "[TensorRT EP] Could not write timing cache to: " << outFileName;
+    return;
+  }
+  oFile.write((char*)blob->data(), blob->size());
+  oFile.close();
+}
 }  // namespace

 namespace google {
@ -312,7 +338,10 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
    }
    dump_subgraphs_ = info.dump_subgraphs;
    engine_cache_enable_ = info.engine_cache_enable;
-    if (engine_cache_enable_ || int8_enable_) {
+    timing_cache_enable_ = info.timing_cache_enable;
+    force_timing_cache_match_ = info.force_timing_cache;
+    detailed_build_log_ = info.detailed_build_log;
+    if (engine_cache_enable_ || int8_enable_ || timing_cache_enable_) {
      cache_path_ = info.engine_cache_path;
    }
    engine_decryption_enable_ = info.engine_decryption_enable;
@ -386,7 +415,22 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
      engine_cache_enable_ = (std::stoi(engine_cache_enable_env) == 0 ? false : true);
    }

-    if (engine_cache_enable_ || int8_enable_) {
+    const std::string timing_cache_enable_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kTimingCacheEnable);
+    if (!timing_cache_enable_env.empty()) {
+      timing_cache_enable_ = (std::stoi(timing_cache_enable_env) == 0 ? false : true);
+    }
+
+    const std::string detailed_build_log_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kDetailedBuildLog);
+    if (!detailed_build_log_env.empty()) {
+      detailed_build_log_ = (std::stoi(detailed_build_log_env) == 0 ? false : true);
+    }
+
+    const std::string timing_force_match_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kForceTimingCache);
+    if (!timing_force_match_env.empty()) {
+      force_timing_cache_match_ = (std::stoi(timing_force_match_env) == 0 ? false : true);
+    }
+
+    if (engine_cache_enable_ || int8_enable_ || timing_cache_enable_) {
      const std::string engine_cache_path = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kEngineCachePath);
      cache_path_ = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kCachePath);
      if (!engine_cache_path.empty() && cache_path_.empty()) {
@ -438,7 +482,7 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
    dla_core_ = 0;
  }

-  if (engine_cache_enable_ || int8_enable_) {
+  if (engine_cache_enable_ || int8_enable_ || timing_cache_enable_) {
    if (!cache_path_.empty() && !fs::is_directory(cache_path_)) {
      if (!fs::create_directory(cache_path_)) {
        throw std::runtime_error("Failed to create directory " + cache_path_);
@ -1373,6 +1417,12 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
    if (!has_dynamic_shape) {
      const std::string cache_path = GetCachePath(cache_path_, trt_node_name_with_precision);
      const std::string engine_cache_path = cache_path + ".engine";
+      std::string timing_cache_path = "";
+      if (timing_cache_enable_) {
+        cudaDeviceProp prop;
+        CUDA_CALL_THROW(cudaGetDeviceProperties(&prop, device_id_));
+        timing_cache_path = GetTimingCachePath(cache_path_, prop);
+      }
      {
        // ifstream file check, engine serialization/deserialization and engine build are in critical section. It needs lock protection to prevent race condition when inferencing with multithreading.
        auto lock = GetApiLock();
@ -1419,12 +1469,36 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
            }
          }

+          // Load timing cache from file. Create a fresh cache if the file doesn't exist
+          std::unique_ptr<nvinfer1::ITimingCache> timing_cache = nullptr;
+          if (timing_cache_enable_) {
+            std::vector<char> loaded_timing_cache = loadTimingCacheFile(timing_cache_path);
+            timing_cache.reset(trt_config->createTimingCache(static_cast<const void*>(loaded_timing_cache.data()), loaded_timing_cache.size()));
+            if (timing_cache == nullptr) {
+              return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
+                                     "TensorRT EP could not create timing cache: " + timing_cache_path);
+            }
+            trt_config->setTimingCache(*timing_cache, force_timing_cache_match_);
+            if (detailed_build_log_) {
+              LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Deserialized timing cache from " + timing_cache_path;
+            }
+          }
+
          // Build engine
+          std::chrono::steady_clock::time_point engine_build_start;
+          if (detailed_build_log_) {
+            engine_build_start = std::chrono::steady_clock::now();
+          }
          trt_engine = std::unique_ptr<nvinfer1::ICudaEngine>(trt_builder->buildEngineWithConfig(*trt_network, *trt_config));
          if (trt_engine == nullptr) {
            return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
                                   "TensorRT EP could not build engine for fused node: " + fused_node.Name());
          }
+          if (detailed_build_log_) {
+            auto engine_build_stop = std::chrono::steady_clock::now();
+            LOGS_DEFAULT(INFO) << "TensorRT engine build for " << trt_node_name_with_precision << " took: " <<
+              std::chrono::duration_cast<std::chrono::milliseconds>(engine_build_stop - engine_build_start).count() << "ms" << std::endl;
+          }
          if (engine_cache_enable_) {
            std::unique_ptr<nvinfer1::IHostMemory> serializedModel(trt_engine->serialize());
            size_t engine_size = serializedModel->size();
@ -1438,7 +1512,20 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
              std::ofstream file(engine_cache_path, std::ios::binary | std::ios::out);
              file.write(reinterpret_cast<char*>(serializedModel->data()), engine_size);
            }
-            LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized " + engine_cache_path;
+            LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized engine " + engine_cache_path;
+          }
+          // serialize and save timing cache
+          if (timing_cache_enable_) {
+            auto timing_cache = trt_config->getTimingCache();
+            std::unique_ptr<nvinfer1::IHostMemory> timingCacheHostData{timing_cache->serialize()};
+            if (timingCacheHostData == nullptr) {
+              return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
+                                     "TensorRT EP could not serialize timing cache: " + timing_cache_path);
+            }
+            saveTimingCacheFile(timing_cache_path, timingCacheHostData.get());
+            if (detailed_build_log_) {
+              LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized timing cache " + timing_cache_path;
+            }
          }
        }
      }
@ -1504,7 +1591,8 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
            input_shape_ranges_[context->node_name], &tensorrt_mu_, fp16_enable_, int8_enable_, int8_calibration_cache_available_,
            dla_enable_, dla_core_, &max_workspace_size_, trt_node_name_with_precision, engine_cache_enable_, cache_path_,
            runtime_.get(), nullptr, allocator_, context_memory_sharing_enable_, &max_ctx_mem_size_, &context_memory_,
-            dynamic_range_map, engine_decryption_enable_, engine_decryption_, engine_encryption_};
+            dynamic_range_map, engine_decryption_enable_, engine_decryption_, engine_encryption_, timing_cache_enable_,
+            force_timing_cache_match_, detailed_build_log_};
      *state = p.release();
      return 0;
    };
@ -1545,6 +1633,12 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
      const std::string cache_path = GetCachePath(trt_state->engine_cache_path, trt_state->trt_node_name_with_precision);
      const std::string engine_cache_path = cache_path + ".engine";
      const std::string profile_cache_path = cache_path + ".profile";
+      std::string timing_cache_path = "";
+      if (timing_cache_enable_) {
+        cudaDeviceProp prop;
+        CUDA_CALL_THROW(cudaGetDeviceProperties(&prop, device_id_));
+        timing_cache_path = GetTimingCachePath(cache_path_, prop);
+      }
      if (trt_state->engine_cache_enable && trt_engine == nullptr) {
        std::ifstream engine_file(engine_cache_path, std::ios::binary | std::ios::in);
        std::ifstream profile_file(profile_cache_path, std::ios::binary | std::ios::in);
@ -1779,11 +1873,35 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
          trt_config->setDLACore(trt_state->dla_core);
        }

+        // Load timing cache from file. Create a fresh cache if the file doesn't exist
+        std::unique_ptr<nvinfer1::ITimingCache> timing_cache = nullptr;
+        if (trt_state->timing_cache_enable) {
+          std::vector<char> loaded_timing_cache = loadTimingCacheFile(timing_cache_path);
+          timing_cache.reset(trt_config->createTimingCache(static_cast<const void*>(loaded_timing_cache.data()), loaded_timing_cache.size()));
+          if (timing_cache == nullptr) {
+            return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
+                                   "TensorRT EP could not create timing cache: " + timing_cache_path);
+          }
+          trt_config->setTimingCache(*timing_cache, force_timing_cache_match_);
+          if (detailed_build_log_) {
+            LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Deserialized timing cache from " + timing_cache_path;
+          }
+        }
+
        // Build engine
        {
          auto lock = GetApiLock();
+          std::chrono::steady_clock::time_point engine_build_start;
+          if (detailed_build_log_) {
+            engine_build_start = std::chrono::steady_clock::now();
+          }
          *(trt_state->engine) = std::unique_ptr<nvinfer1::ICudaEngine>(
              trt_builder->buildEngineWithConfig(*trt_state->network->get(), *trt_config));
+          if (detailed_build_log_) {
+            auto engine_build_stop = std::chrono::steady_clock::now();
+            LOGS_DEFAULT(INFO) << "TensorRT engine build for " << trt_state->trt_node_name_with_precision << " took: " <<
+              std::chrono::duration_cast<std::chrono::milliseconds>(engine_build_stop - engine_build_start).count() << "ms" << std::endl;
+          }
        }
        if (trt_state->engine == nullptr) {
          return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP Failed to Build Engine.");
@ -1809,6 +1927,20 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
          }
        }

+        // serialize and save timing cache
+        if (trt_state->timing_cache_enable) {
+          auto timing_cache = trt_config->getTimingCache();
+          std::unique_ptr<nvinfer1::IHostMemory> timingCacheHostData{timing_cache->serialize()};
+          if (timingCacheHostData == nullptr) {
+            return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
+                                   "TensorRT EP could not serialize timing cache: " + timing_cache_path);
+          }
+          saveTimingCacheFile(timing_cache_path, timingCacheHostData.get());
+          if (detailed_build_log_) {
+            LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized timing cache " + timing_cache_path;
+          }
+        }
+
        // Build context
        if (trt_state->context_memory_sharing_enable) {
          *(trt_state->context) = std::unique_ptr<nvinfer1::IExecutionContext>(
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
@ -30,6 +30,9 @@ static const std::string kDecryptionLibPath = "ORT_TENSORRT_ENGINE_DECRYPTION_LI
 static const std::string kForceSequentialEngineBuild = "ORT_TENSORRT_FORCE_SEQUENTIAL_ENGINE_BUILD";
 static const std::string kContextMemorySharingEnable = "ORT_TENSORRT_CONTEXT_MEMORY_SHARING_ENABLE";
 static const std::string kLayerNormFP32Fallback = "ORT_TENSORRT_LAYER_NORM_FP32_FALLBACK";
+static const std::string kTimingCacheEnable = "ORT_TENSORRT_TIMING_CACHE_ENABLE";
+static const std::string kForceTimingCache = "ORT_TENSORRT_FORCE_TIMING_CACHE_ENABLE";
+static const std::string kDetailedBuildLog = "ORT_TENSORRT_DETAILED_BUILD_LOG_ENABLE";
 // Old env variable for backward compatibility
 static const std::string kEngineCachePath = "ORT_TENSORRT_ENGINE_CACHE_PATH";
 }  // namespace tensorrt_env_vars
@ -114,6 +117,9 @@ struct TensorrtFuncState {
  bool engine_decryption_enable = false;
  int (*engine_decryption)(const char*, char*, size_t*) = nullptr;
  int (*engine_encryption)(const char*, char*, size_t) = nullptr;
+  bool timing_cache_enable = true;
+  bool force_timing_cache = false;
+  bool detailed_build_log = false;
 };

 // Logical device representation.
@ -176,6 +182,9 @@ class TensorrtExecutionProvider : public IExecutionProvider {
  bool engine_decryption_enable_ = false;
  int (*engine_decryption_)(const char*, char*, size_t*) = nullptr;
  int (*engine_encryption_)(const char*, char*, size_t) = nullptr;
+  bool timing_cache_enable_ = false;
+  bool force_timing_cache_match_ = false;
+  bool detailed_build_log_ = false;

  std::unordered_set<std::string> control_flow_op_set_ = {"If", "Loop", "Scan"};
  std::unordered_map<std::string, tensorrt_ptr::unique_pointer<nvonnxparser::IParser>> parsers_;
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc
@ -27,11 +27,14 @@ constexpr const char* kCachePath = "trt_engine_cache_path";
 constexpr const char* kDecryptionEnable = "trt_engine_decryption_enable";
 constexpr const char* kDecryptionLibPath = "trt_engine_decryption_lib_path";
 constexpr const char* kForceSequentialEngineBuild = "trt_force_sequential_engine_build";
-// add new provider option name here. 
+// add new provider option name here.
 constexpr const char* kContextMemorySharingEnable = "trt_context_memory_sharing_enable";
 constexpr const char* kLayerNormFP32Fallback = "trt_layer_norm_fp32_fallback";
+constexpr const char* kTimingCacheEnable = "trt_timing_cache_enable";
+constexpr const char* kForceTimingCacheMatch = "trt_force_timing_cache_match";
+constexpr const char* kDetailedBuildLog = "trt_detailed_build_log";
 }  // namespace provider_option_names
-}  // namespace tensorrt 
+}  // namespace tensorrt

 TensorrtExecutionProviderInfo TensorrtExecutionProviderInfo::FromProviderOptions(const ProviderOptions& options) {
  TensorrtExecutionProviderInfo info{};
@ -57,15 +60,17 @@ TensorrtExecutionProviderInfo TensorrtExecutionProviderInfo::FromProviderOptions
          .AddAssignmentToReference(tensorrt::provider_option_names::kInt8CalibTable, info.int8_calibration_table_name)
          .AddAssignmentToReference(tensorrt::provider_option_names::kInt8UseNativeCalibTable, info.int8_use_native_calibration_table)
          .AddAssignmentToReference(tensorrt::provider_option_names::kDLAEnable, info.dla_enable)
-          .AddAssignmentToReference(tensorrt::provider_option_names::kDLACore, info.dla_core)		  
+          .AddAssignmentToReference(tensorrt::provider_option_names::kDLACore, info.dla_core)
          .AddAssignmentToReference(tensorrt::provider_option_names::kDumpSubgraphs, info.dump_subgraphs)
          .AddAssignmentToReference(tensorrt::provider_option_names::kEngineCacheEnable, info.engine_cache_enable)
          .AddAssignmentToReference(tensorrt::provider_option_names::kCachePath, info.engine_cache_path)
          .AddAssignmentToReference(tensorrt::provider_option_names::kDecryptionEnable, info.engine_decryption_enable)
-          .AddAssignmentToReference(tensorrt::provider_option_names::kDecryptionLibPath, info.engine_decryption_lib_path) 
+          .AddAssignmentToReference(tensorrt::provider_option_names::kDecryptionLibPath, info.engine_decryption_lib_path)
          .AddAssignmentToReference(tensorrt::provider_option_names::kForceSequentialEngineBuild, info.force_sequential_engine_build)
          .AddAssignmentToReference(tensorrt::provider_option_names::kContextMemorySharingEnable, info.context_memory_sharing_enable)
          .AddAssignmentToReference(tensorrt::provider_option_names::kLayerNormFP32Fallback, info.layer_norm_fp32_fallback)
+          .AddAssignmentToReference(tensorrt::provider_option_names::kTimingCacheEnable, info.timing_cache_enable)
+          .AddAssignmentToReference(tensorrt::provider_option_names::kForceTimingCacheMatch, info.force_timing_cache)
          .Parse(options)); // add new provider option here.

  return info;
@ -93,6 +98,7 @@ ProviderOptions TensorrtExecutionProviderInfo::ToProviderOptions(const TensorrtE
      // add new provider option here.
      {tensorrt::provider_option_names::kContextMemorySharingEnable, MakeStringWithClassicLocale(info.context_memory_sharing_enable)},
      {tensorrt::provider_option_names::kLayerNormFP32Fallback, MakeStringWithClassicLocale(info.layer_norm_fp32_fallback)},
+      {tensorrt::provider_option_names::kTimingCacheEnable, MakeStringWithClassicLocale(info.timing_cache_enable)},
  };
  return options;
 }
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h
@ -17,10 +17,10 @@ struct TensorrtExecutionProviderInfo {
  void* user_compute_stream{nullptr};
  bool has_trt_options{false};
  int max_partition_iterations{1000};
-  int min_subgraph_size{1};  
+  int min_subgraph_size{1};
  size_t max_workspace_size{1 << 30};
  bool fp16_enable{false};
-  bool int8_enable{false}; 
+  bool int8_enable{false};
  std::string int8_calibration_table_name{""};
  bool int8_use_native_calibration_table{false};
  bool dla_enable{false};
@ -33,6 +33,9 @@ struct TensorrtExecutionProviderInfo {
  bool force_sequential_engine_build{false};
  bool context_memory_sharing_enable{false};
  bool layer_norm_fp32_fallback{false};
+  bool timing_cache_enable{false};
+  bool force_timing_cache{false};
+  bool detailed_build_log{false};

  static TensorrtExecutionProviderInfo FromProviderOptions(const ProviderOptions& options);
  static ProviderOptions ToProviderOptions(const TensorrtExecutionProviderInfo& info);
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_utils.h
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_utils.h
@ -168,6 +168,17 @@ std::string GetCachePath(const std::string& root, const std::string& name) {
  }
 }

+/*
+ * Get Timing by compute capability
+ *
+ */
+std::string GetTimingCachePath(const std::string& root, cudaDeviceProp prop) {
+  // append compute capability of the GPU as this invalidates the cache and TRT will throw when loading the cache
+  const std::string timing_cache_name = "TensorrtExecutionProvider_cache_cc"  +
+      std::to_string(prop.major * 10 + prop.minor) + ".timing";
+  return GetCachePath(root, timing_cache_name);
+}
+
 /*
 * Get cache by type
 *
--- a/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc
@ -70,6 +70,8 @@ struct Tensorrt_Provider : Provider {
    info.force_sequential_engine_build = options.trt_force_sequential_engine_build != 0;
    info.context_memory_sharing_enable = options.trt_context_memory_sharing_enable != 0;
    info.layer_norm_fp32_fallback = options.trt_layer_norm_fp32_fallback != 0;
+    info.timing_cache_enable = options.trt_timing_cache_enable != 0;
+    info.detailed_build_log = options.trt_detailed_build_log != 0;
    return std::make_shared<TensorrtProviderFactory>(info);
  }

@ -137,6 +139,8 @@ struct Tensorrt_Provider : Provider {
    trt_options.trt_force_sequential_engine_build = internal_options.force_sequential_engine_build;
    trt_options.trt_context_memory_sharing_enable = internal_options.context_memory_sharing_enable;
    trt_options.trt_layer_norm_fp32_fallback = internal_options.layer_norm_fp32_fallback;
+    trt_options.trt_timing_cache_enable = internal_options.timing_cache_enable;
+    trt_options.trt_force_timing_cache = internal_options.force_timing_cache;
  }

  ProviderOptions GetProviderOptions(const void* provider_options) override {
--- a/onnxruntime/core/session/provider_bridge_ort.cc
+++ b/onnxruntime/core/session/provider_bridge_ort.cc
@ -1274,7 +1274,10 @@ OrtTensorRTProviderOptionsV2 OrtTensorRTProviderOptionsToOrtTensorRTProviderOpti
  trt_options_converted.trt_engine_decryption_lib_path = legacy_trt_options->trt_engine_decryption_lib_path;
  trt_options_converted.trt_force_sequential_engine_build = legacy_trt_options->trt_force_sequential_engine_build;
  // Add new provider option below
-  // Use default value as this field is not available in OrtTensorRTProviderOptionsV
+  // Use default value as this field is not available in OrtTensorRTProviderOptions
+  trt_options_converted.trt_timing_cache_enable = 0;
+  trt_options_converted.trt_force_timing_cache = 0;
+  trt_options_converted.trt_detailed_build_log = 0;
  trt_options_converted.trt_context_memory_sharing_enable = 0;
  trt_options_converted.trt_layer_norm_fp32_fallback = 0;
  return trt_options_converted;
@ -1601,6 +1604,9 @@ ORT_API_STATUS_IMPL(OrtApis::CreateTensorRTProviderOptions, _Outptr_ OrtTensorRT
  (*out)->trt_force_sequential_engine_build = false;
  (*out)->trt_context_memory_sharing_enable = false;
  (*out)->trt_layer_norm_fp32_fallback = false;
+  (*out)->trt_timing_cache_enable = false;
+  (*out)->trt_force_timing_cache = false;
+  (*out)->trt_detailed_build_log = false;
  return nullptr;
 #else
  ORT_UNUSED_PARAMETER(out);
--- a/onnxruntime/python/onnxruntime_pybind_state.cc
+++ b/onnxruntime/python/onnxruntime_pybind_state.cc
@ -368,6 +368,9 @@ std::unique_ptr<IExecutionProvider> CreateExecutionProviderInstance(
            nullptr,
            0,
            0,
+            0,
+            0,
+            0,
            0};
        for (auto option : it->second) {
          if (option.first == "device_id") {
--- a/onnxruntime/test/perftest/ort_test_session.cc
+++ b/onnxruntime/test/perftest/ort_test_session.cc
@ -121,6 +121,9 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device
    bool trt_force_sequential_engine_build = false;
    bool trt_context_memory_sharing_enable = false;
    bool trt_layer_norm_fp32_fallback = false;
+    bool trt_timing_cache_enable = false;
+    bool trt_force_timing_cache = false;
+    bool trt_detailed_build_log = false;

 #ifdef _MSC_VER
    std::string ov_string = ToUTF8String(performance_test_config.run_config.ep_runtime_config_string);
@ -268,6 +271,30 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device
        } else {
          ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_layer_norm_fp32_fallback' should be a boolean i.e. true or false. Default value is false.\n");
        }
+      } else if (key == "trt_timing_cache_enable") {
+        if (value == "true" || value == "True") {
+          trt_timing_cache_enable = true;
+        } else if (value == "false" || value == "False") {
+          trt_timing_cache_enable = false;
+        } else {
+          ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_timing_cache_enable' should be a boolean i.e. true or false. Default value is false.\n");
+        }
+      } else if (key == "trt_force_timing_cache") {
+        if (value == "true" || value == "True") {
+          trt_force_timing_cache = true;
+        } else if (value == "false" || value == "False") {
+          trt_force_timing_cache = false;
+        } else {
+          ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_force_timing_cache' should be a boolean i.e. true or false. Default value is false.\n");
+        }
+      } else if (key == "trt_detailed_build_log") {
+        if (value == "true" || value == "True") {
+          trt_detailed_build_log = true;
+        } else if (value == "false" || value == "False") {
+          trt_detailed_build_log = false;
+        } else {
+          ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_detailed_build_log' should be a boolean i.e. true or false. Default value is false.\n");
+        }
      } else {
        ORT_THROW("[ERROR] [TensorRT] wrong key type entered. Choose from the following runtime key options that are available for TensorRT. ['device_id', 'trt_max_partition_iterations', 'trt_min_subgraph_size', 'trt_max_workspace_size', 'trt_fp16_enable', 'trt_int8_enable', 'trt_int8_calibration_table_name', 'trt_int8_use_native_calibration_table', 'trt_dla_enable', 'trt_dla_core', 'trt_dump_subgraphs', 'trt_engine_cache_enable', 'trt_engine_cache_path', 'trt_engine_decryption_enable', 'trt_engine_decryption_lib_path', 'trt_force_sequential_engine_build', 'trt_context_memory_sharing_enable', 'trt_layer_norm_fp32_fallback'] \n");
      }
@ -293,6 +320,9 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device
    tensorrt_options.trt_force_sequential_engine_build = trt_force_sequential_engine_build;
    tensorrt_options.trt_context_memory_sharing_enable = trt_context_memory_sharing_enable;
    tensorrt_options.trt_layer_norm_fp32_fallback = trt_layer_norm_fp32_fallback;
+    tensorrt_options.trt_timing_cache_enable = trt_timing_cache_enable;
+    tensorrt_options.trt_force_timing_cache = trt_force_timing_cache;
+    tensorrt_options.trt_detailed_build_log = trt_detailed_build_log;
    session_options.AppendExecutionProvider_TensorRT_V2(tensorrt_options);

    OrtCUDAProviderOptions cuda_options;
--- a/onnxruntime/test/perftest/performance_runner.cc
+++ b/onnxruntime/test/perftest/performance_runner.cc
@ -114,7 +114,9 @@ Status PerformanceRunner::Run() {
  }

  // warm up
+  initial_inference_result_.start = std::chrono::high_resolution_clock::now();
  ORT_RETURN_IF_ERROR(RunOneIteration<true>());
+  initial_inference_result_.end = std::chrono::high_resolution_clock::now();

  // TODO: start profiling
  // if (!performance_test_config_.run_config.profile_file.empty())
@ -139,9 +141,12 @@ Status PerformanceRunner::Run() {
  std::chrono::duration<double> session_create_duration = session_create_end_ - session_create_start_;
  // TODO: end profiling
  // if (!performance_test_config_.run_config.profile_file.empty()) session_object->EndProfiling();
+  auto first_inference_duration =
+    std::chrono::duration_cast<std::chrono::milliseconds>(initial_inference_result_.end - initial_inference_result_.start).count();
  std::chrono::duration<double> inference_duration = performance_result_.end - performance_result_.start;

  std::cout << "Session creation time cost: " << session_create_duration.count() << " s\n"
+            << "First inference time cost: " << first_inference_duration << " ms\n"
            << "Total inference time cost: " << performance_result_.total_time_cost << " s\n"  // sum of time taken by each request
            << "Total inference requests: " << performance_result_.time_costs.size() << "\n"
            << "Average inference time cost: " << performance_result_.total_time_cost / performance_result_.time_costs.size() * 1000 << " ms\n"
--- a/onnxruntime/test/perftest/performance_runner.h
+++ b/onnxruntime/test/perftest/performance_runner.h
@ -106,6 +106,7 @@ class PerformanceRunner {
 private:
  std::chrono::time_point<std::chrono::high_resolution_clock> session_create_start_;
  std::chrono::time_point<std::chrono::high_resolution_clock> session_create_end_;
+  PerformanceResult initial_inference_result_;
  PerformanceResult performance_result_;
  PerformanceTestConfig performance_test_config_;
  std::unique_ptr<TestModelInfo> test_model_info_;
--- a/onnxruntime/test/providers/cpu/model_tests.cc
+++ b/onnxruntime/test/providers/cpu/model_tests.cc
@ -683,7 +683,7 @@ TEST_P(ModelTest, Run) {
        if (test_case_name.find(ORT_TSTR("FLOAT16")) != std::string::npos) {
          OrtTensorRTProviderOptionsV2 params{0, 0, nullptr, 1000, 1, 1 << 30,
                                              1,  // enable fp16
-                                              0, nullptr, 0, 0, 0, 0, 0, nullptr, 0, nullptr, 0, 0, 0};
+                                              0, nullptr, 0, 0, 0, 0, 0, nullptr, 0, nullptr, 0, 0, 0, 0, 0, 0};
          ortso.AppendExecutionProvider_TensorRT_V2(params);
        } else {
          OrtTensorRTProviderOptionsV2* ep_option = nullptr;
--- a/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc
+++ b/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc
@ -11,6 +11,8 @@
 #include "core/providers/tensorrt/tensorrt_execution_provider_utils.h"
 #include <string>
 #include <thread>
+#include <filesystem>
+#include <chrono>

 using namespace std;
 using namespace ONNX_NAMESPACE;
@ -151,6 +153,9 @@ void RunWithOneSessionSingleThreadInference(std::string model_name, std::string
      nullptr,
      0,
      0,
+      0,
+      0,
+      0,
      0};

    params.trt_engine_cache_enable = 1;
@ -222,6 +227,9 @@ void RunWithOneSessionMultiThreadsInference(std::string model_name, std::string
      nullptr,
      0,
      0,
+      0,
+      0,
+      0,
      0};

    params.trt_engine_cache_enable = 1;
@ -386,6 +394,9 @@ TEST_P(TensorrtExecutionProviderCacheTest, Run) {
      nullptr,
      0,
      0,
+      0,
+      0,
+      0,
      0};

  if (cache_type.compare("engine") == 0) {
@ -498,9 +509,68 @@ TEST_P(TensorrtExecutionProviderCacheTest, Run) {
    }
  } else if (cache_type.compare("timing") == 0) {
     // add test code here
+
+    /* The following code block tests the timing cache functionality of ORT TRT, including:
+     * - timing cache serialization/de-serialization
+     * - benefit of using a timing cache regardless of dynamic / static input
+     */
+    uint64_t compilation_without_cache_ms, compilation_with_cache_ms;
+
+    params.trt_timing_cache_enable = 1;
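+    // Time session setup plus the first inference. NOTE(review): durations below are cast to microseconds despite the *_ms names - confirm intended unit.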
+    {
+      auto start = chrono::steady_clock::now();
+      std::unique_ptr<IExecutionProvider> execution_provider = TensorrtExecutionProviderWithOptions(&params);
+      EXPECT_TRUE(session_object.RegisterExecutionProvider(std::move(execution_provider)).IsOK());
+      auto status = session_object.Load(model_name);
+      ASSERT_TRUE(status.IsOK());
+      status = session_object.Initialize();
+      ASSERT_TRUE(status.IsOK());
+
+      // run inference
+      // TRT timing cache should be created under the situation of non-dynamic/dynamic shape input
+      status = session_object.Run(run_options, feeds, output_names, &fetches);
+      auto end = chrono::steady_clock::now();
+      ASSERT_TRUE(status.IsOK());
+      VerifyOutputs(fetches, expected_dims_mul_m, expected_values_mul_m);
+      ASSERT_TRUE(IsCacheExistedByType("./", ".timing"));
+      compilation_without_cache_ms = chrono::duration_cast<chrono::microseconds>(end - start).count();
+    }
+
+    // get new session and reinitialize model
+    // a second, identical inference should reuse the timing cache and therefore build faster
+    if (input_type.compare("static") == 0) {
+      {
+        InferenceSession session_object_new{so, GetEnvironment()};
+        {
+          auto start = chrono::steady_clock::now();
+          std::unique_ptr<IExecutionProvider> execution_provider = TensorrtExecutionProviderWithOptions(&params);
+          EXPECT_TRUE(session_object_new.RegisterExecutionProvider(std::move(execution_provider)).IsOK());
+          auto status = session_object_new.Load(model_name);
+          ASSERT_TRUE(status.IsOK());
+          status = session_object_new.Initialize();
+          ASSERT_TRUE(status.IsOK());
+
+          // run inference
+          // TRT timing cache should be created under the situation of non-dynamic/dynamic shape input
+          status = session_object_new.Run(run_options, feeds, output_names, &fetches);
+          // TODO narrow down actual compilation section
+          auto end = chrono::steady_clock::now();
+
+          ASSERT_TRUE(status.IsOK());
+          VerifyOutputs(fetches, expected_dims_mul_m, expected_values_mul_m);
+          ASSERT_TRUE(IsCacheExistedByType("./", ".timing"));
+          compilation_with_cache_ms = chrono::duration_cast<chrono::microseconds>(end - start).count();
+        }
+      }
+      ASSERT_TRUE(compilation_with_cache_ms <= compilation_without_cache_ms);
+    } else {
+      // TODO test dynamic shapes
+    }
  }

  // clean up caches
+  RemoveCachesByType("./", ".timing");
  RemoveCachesByType("./", ".engine");
  RemoveCachesByType("./", ".profile");
 }
@ -515,11 +585,13 @@ TEST_P(TensorrtExecutionProviderCacheTest, Run) {
 * We have following test parameters:
 * - engine_static: engine cache enabled with non-dynamic input shape
 * - engine_dynamic: engine cache enabled with dynamic input shape
- * - timing_static: will be added
- * - timing_dynamic: will be added
+ * - timing_static: timing cache enabled, static input shape
+ * - timing_dynamic: timing cache enabled, dynamic input shape
 */
 INSTANTIATE_TEST_SUITE_P(TensorrtExecutionProviderCacheTests, TensorrtExecutionProviderCacheTest, testing::Values("engine_static",
-                                                                                                                  "engine_dynamic"),
+                                                                                                                  "engine_dynamic",
+                                                                                                                  "timing_static",
+                                                                                                                  "timing_dynamic"),
                                                                                                  [](const ::testing::TestParamInfo<TensorrtExecutionProviderCacheTest::ParamType>& info) {return info.param;});

 TEST(TensorrtExecutionProviderTest, FunctionTest) {