From 9a80955c4592e8629dd8f623e0a4d1bca7c80b33 Mon Sep 17 00:00:00 2001
From: yf711 <109183385+yf711@users.noreply.github.com>
Date: Tue, 20 Jun 2023 23:34:24 -0700
Subject: [PATCH] Add compute capacity to trtep engine cache file (#16356)

### Description

Add "_smXX" to the TensorRT EP engine cache file name, where "sm" stands
for "Streaming Multiprocessor" and XX is the GPU compute capability
(major * 10 + minor, e.g. "_sm80").

> The GPU compute capability version is prefixed with "SM" because
NVIDIA typically improves and updates the SM in each new GPU
architecture.

### Motivation and Context

Github issue: https://github.com/microsoft/onnxruntime/issues/15982

Reduce the chance of misusing an incompatible engine cache when the user
switches between GPU devices with different compute capabilities.

* The prevention can't be 100%, as model size and GPU memory size are
other factors that could make a cache incompatible
---
 .../tensorrt/tensorrt_execution_provider.cc   | 25 +++++++++++++------
 .../tensorrt_execution_provider_utils.h       | 13 ++++++++--
 2 files changed, 28 insertions(+), 10 deletions(-)
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
index 8af8eb327f..df591ed4e1 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
@@ -1929,15 +1929,20 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
     // Otherwise engine will be handled at inference time.
     std::unique_ptr<nvinfer1::ICudaEngine> trt_engine;
     std::unique_ptr<nvinfer1::IExecutionContext> trt_context;
+
+    // Name the engine cache based on GPU compute capacity and reduce the chance of loading an incompatible cache
+    // Note: Engine cache generated on a GPU with large memory might not be loadable on a GPU with smaller memory, even if they share the same compute capacity
+    cudaDeviceProp prop;
+    CUDA_CALL_THROW(cudaGetDeviceProperties(&prop, device_id_));
+    std::string compute_capability = GetComputeCapacity(prop);
+
     if (!has_dynamic_shape) {
       const std::string cache_path = GetCachePath(cache_path_, trt_node_name_with_precision);
-      const std::string engine_cache_path = cache_path + ".engine";
-      const std::string profile_cache_path = cache_path + ".profile";
+      const std::string engine_cache_path = cache_path + "_sm" + compute_capability + ".engine";
+      const std::string profile_cache_path = cache_path + "_sm" + compute_capability + ".profile";
       std::string timing_cache_path = "";
       bool engine_update = false;
       if (timing_cache_enable_) {
-        cudaDeviceProp prop;
-        CUDA_CALL_THROW(cudaGetDeviceProperties(&prop, device_id_));
         timing_cache_path = GetTimingCachePath(cache_path_, prop);
       }
       {
@@ -2169,14 +2174,18 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
       Ort::ThrowOnError(api->KernelContext_GetGPUComputeStream(context, &cuda_stream));
       cudaStream_t stream = static_cast<cudaStream_t>(cuda_stream);
 
+      // Name the engine cache based on GPU compute capacity and reduce the chance of loading an incompatible cache
+      // Note: Engine cache generated on a GPU with large memory might not be loadable on a GPU with smaller memory, even if they share the same compute capacity
+      cudaDeviceProp prop;
+      CUDA_CALL_THROW(cudaGetDeviceProperties(&prop, device_id_));
+      std::string compute_capability = GetComputeCapacity(prop);
+
       // Load serialized engine
       const std::string cache_path = GetCachePath(trt_state->engine_cache_path, trt_state->trt_node_name_with_precision);
-      const std::string engine_cache_path = cache_path + ".engine";
-      const std::string profile_cache_path = cache_path + ".profile";
+      const std::string engine_cache_path = cache_path + "_sm" + compute_capability + ".engine";
+      const std::string profile_cache_path = cache_path + "_sm" + compute_capability + ".profile";
       std::string timing_cache_path = "";
       if (timing_cache_enable_) {
-        cudaDeviceProp prop;
-        CUDA_CALL_THROW(cudaGetDeviceProperties(&prop, device_id_));
         timing_cache_path = GetTimingCachePath(cache_path_, prop);
       }
       if (trt_state->engine_cache_enable && trt_engine == nullptr) {
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_utils.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_utils.h
index adebfd9971..6bbeab7e94 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_utils.h
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_utils.h
@@ -443,14 +443,23 @@ std::string GetCachePath(const std::string& root, const std::string& name) {
   }
 }
 
+/*
+ * Get compute capability of `prop` as a decimal string: major * 10 + minor (e.g. major 8, minor 6 gives "86")
+ * Used as the "_smXX" suffix of engine/profile cache file names and in the timing cache file name
+ */
+std::string GetComputeCapacity(const cudaDeviceProp& prop) {
+  const std::string compute_capability = std::to_string(prop.major * 10 + prop.minor);
+  return compute_capability;
+}
+
 /*
  * Get Timing by compute capability
  *
  */
 std::string GetTimingCachePath(const std::string& root, cudaDeviceProp prop) {
   // append compute capability of the GPU as this invalidates the cache and TRT will throw when loading the cache
-  const std::string timing_cache_name = "TensorrtExecutionProvider_cache_cc" +
-                                        std::to_string(prop.major * 10 + prop.minor) + ".timing";
+  const std::string timing_cache_name = "TensorrtExecutionProvider_cache_sm" +
+                                        GetComputeCapacity(prop) + ".timing";
   return GetCachePath(root, timing_cache_name);
 }