From 9a80955c4592e8629dd8f623e0a4d1bca7c80b33 Mon Sep 17 00:00:00 2001 From: yf711 <109183385+yf711@users.noreply.github.com> Date: Tue, 20 Jun 2023 23:34:24 -0700 Subject: [PATCH] Add compute capacity to trtep engine cache file (#16356) ### Description Add "_smXX" to the TensorRT EP (trtep) engine cache file name, where "sm" stands for "Streaming Multiprocessor" and XX is the GPU's compute capability (major * 10 + minor, e.g. "_sm80" for compute capability 8.0). > The GPU compute capability version is prefixed with "SM" because NVIDIA typically improves and updates the SM in each new GPU architecture. ### Motivation and Context GitHub issue: https://github.com/microsoft/onnxruntime/issues/15982 Reduce the chance of misusing an incompatible engine cache when a user switches between GPU devices with different compute capabilities. * The prevention can't be 100%, as model size and GPU memory size are other factors that can make a cache incompatible. --- .../tensorrt/tensorrt_execution_provider.cc | 25 +++++++++++++------ .../tensorrt_execution_provider_utils.h | 13 ++++++++-- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index 8af8eb327f..df591ed4e1 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -1929,15 +1929,20 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector trt_engine; std::unique_ptr trt_context; + + // Name the engine cache based on GPU compute capacity and reduce the chance of loading an incompatible cache + // Note: Engine cache generated on a GPU with large memory might not be loadable on a GPU with smaller memory, even if they share the same compute capacity + cudaDeviceProp prop; + CUDA_CALL_THROW(cudaGetDeviceProperties(&prop, device_id_)); + std::string compute_capability = GetComputeCapacity(prop); + if (!has_dynamic_shape) { const std::string cache_path = GetCachePath(cache_path_, 
trt_node_name_with_precision); - const std::string engine_cache_path = cache_path + ".engine"; - const std::string profile_cache_path = cache_path + ".profile"; + const std::string engine_cache_path = cache_path + "_sm" + compute_capability + ".engine"; + const std::string profile_cache_path = cache_path + "_sm" + compute_capability + ".profile"; std::string timing_cache_path = ""; bool engine_update = false; if (timing_cache_enable_) { - cudaDeviceProp prop; - CUDA_CALL_THROW(cudaGetDeviceProperties(&prop, device_id_)); timing_cache_path = GetTimingCachePath(cache_path_, prop); } { @@ -2169,14 +2174,18 @@ common::Status TensorrtExecutionProvider::Compile(const std::vectorKernelContext_GetGPUComputeStream(context, &cuda_stream)); cudaStream_t stream = static_cast(cuda_stream); + // Name the engine cache based on GPU compute capacity and reduce the chance of loading an incompatible cache + // Note: Engine cache generated on a GPU with large memory might not be loadable on a GPU with smaller memory, even if they share the same compute capacity + cudaDeviceProp prop; + CUDA_CALL_THROW(cudaGetDeviceProperties(&prop, device_id_)); + std::string compute_capability = GetComputeCapacity(prop); + // Load serialized engine const std::string cache_path = GetCachePath(trt_state->engine_cache_path, trt_state->trt_node_name_with_precision); - const std::string engine_cache_path = cache_path + ".engine"; - const std::string profile_cache_path = cache_path + ".profile"; + const std::string engine_cache_path = cache_path + "_sm" + compute_capability + ".engine"; + const std::string profile_cache_path = cache_path + "_sm" + compute_capability + ".profile"; std::string timing_cache_path = ""; if (timing_cache_enable_) { - cudaDeviceProp prop; - CUDA_CALL_THROW(cudaGetDeviceProperties(&prop, device_id_)); timing_cache_path = GetTimingCachePath(cache_path_, prop); } if (trt_state->engine_cache_enable && trt_engine == nullptr) { diff --git 
a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_utils.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_utils.h index adebfd9971..6bbeab7e94 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_utils.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_utils.h @@ -443,14 +443,23 @@ std::string GetCachePath(const std::string& root, const std::string& name) { } } +/* + * Get compute capability + * + */ +std::string GetComputeCapacity(const cudaDeviceProp& prop) { + const std::string compute_capability = std::to_string(prop.major * 10 + prop.minor); + return compute_capability; +} + /* * Get Timing by compute capability * */ std::string GetTimingCachePath(const std::string& root, cudaDeviceProp prop) { // append compute capability of the GPU as this invalidates the cache and TRT will throw when loading the cache - const std::string timing_cache_name = "TensorrtExecutionProvider_cache_cc" + - std::to_string(prop.major * 10 + prop.minor) + ".timing"; + const std::string timing_cache_name = "TensorrtExecutionProvider_cache_sm" + + GetComputeCapacity(prop) + ".timing"; return GetCachePath(root, timing_cache_name); }