mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-07-02 03:55:34 +00:00
Add compute capacity to trtep engine cache file (#16356)
### Description Add "_smXX" to the trtep engine cache file name, where "sm" stands for "Streaming Multiprocessor" and "XX" is the GPU's compute capability. > The GPU compute capability version is prefixed with "SM" because NVIDIA typically improves and updates the SM in each new GPU architecture. ### Motivation and Context GitHub issue: https://github.com/microsoft/onnxruntime/issues/15982 Reduce the chance of using an incompatible engine cache when the user switches between GPU devices with different compute capabilities. * The prevention can't be 100% effective, as model size and GPU memory size are further factors that can make a cache incompatible.
This commit is contained in:
parent
64b22cd00f
commit
9a80955c45
2 changed files with 28 additions and 10 deletions
|
|
@ -1929,15 +1929,20 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
|
|||
// Otherwise engine will be handled at inference time.
|
||||
std::unique_ptr<nvinfer1::ICudaEngine> trt_engine;
|
||||
std::unique_ptr<nvinfer1::IExecutionContext> trt_context;
|
||||
|
||||
// Name the engine cache based on the GPU compute capability to reduce the chance of loading an incompatible cache
|
||||
// Note: An engine cache generated on a GPU with more memory might not be loadable on a GPU with less memory, even if they share the same compute capability
|
||||
cudaDeviceProp prop;
|
||||
CUDA_CALL_THROW(cudaGetDeviceProperties(&prop, device_id_));
|
||||
std::string compute_capability = GetComputeCapacity(prop);
|
||||
|
||||
if (!has_dynamic_shape) {
|
||||
const std::string cache_path = GetCachePath(cache_path_, trt_node_name_with_precision);
|
||||
const std::string engine_cache_path = cache_path + ".engine";
|
||||
const std::string profile_cache_path = cache_path + ".profile";
|
||||
const std::string engine_cache_path = cache_path + "_sm" + compute_capability + ".engine";
|
||||
const std::string profile_cache_path = cache_path + "_sm" + compute_capability + ".profile";
|
||||
std::string timing_cache_path = "";
|
||||
bool engine_update = false;
|
||||
if (timing_cache_enable_) {
|
||||
cudaDeviceProp prop;
|
||||
CUDA_CALL_THROW(cudaGetDeviceProperties(&prop, device_id_));
|
||||
timing_cache_path = GetTimingCachePath(cache_path_, prop);
|
||||
}
|
||||
{
|
||||
|
|
@ -2169,14 +2174,18 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
|
|||
Ort::ThrowOnError(api->KernelContext_GetGPUComputeStream(context, &cuda_stream));
|
||||
cudaStream_t stream = static_cast<cudaStream_t>(cuda_stream);
|
||||
|
||||
// Name the engine cache based on the GPU compute capability to reduce the chance of loading an incompatible cache
|
||||
// Note: An engine cache generated on a GPU with more memory might not be loadable on a GPU with less memory, even if they share the same compute capability
|
||||
cudaDeviceProp prop;
|
||||
CUDA_CALL_THROW(cudaGetDeviceProperties(&prop, device_id_));
|
||||
std::string compute_capability = GetComputeCapacity(prop);
|
||||
|
||||
// Load serialized engine
|
||||
const std::string cache_path = GetCachePath(trt_state->engine_cache_path, trt_state->trt_node_name_with_precision);
|
||||
const std::string engine_cache_path = cache_path + ".engine";
|
||||
const std::string profile_cache_path = cache_path + ".profile";
|
||||
const std::string engine_cache_path = cache_path + "_sm" + compute_capability + ".engine";
|
||||
const std::string profile_cache_path = cache_path + "_sm" + compute_capability + ".profile";
|
||||
std::string timing_cache_path = "";
|
||||
if (timing_cache_enable_) {
|
||||
cudaDeviceProp prop;
|
||||
CUDA_CALL_THROW(cudaGetDeviceProperties(&prop, device_id_));
|
||||
timing_cache_path = GetTimingCachePath(cache_path_, prop);
|
||||
}
|
||||
if (trt_state->engine_cache_enable && trt_engine == nullptr) {
|
||||
|
|
|
|||
|
|
@ -443,14 +443,23 @@ std::string GetCachePath(const std::string& root, const std::string& name) {
|
|||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Get the GPU compute capability as a decimal string computed as major * 10 + minor
|
||||
* (for example, major 8 and minor 6 yields "86").
|
||||
*/
|
||||
std::string GetComputeCapacity(const cudaDeviceProp& prop) {
|
||||
const std::string compute_capability = std::to_string(prop.major * 10 + prop.minor);
|
||||
return compute_capability;
|
||||
}
|
||||
|
||||
/*
|
||||
* Get the timing cache file path, with the GPU compute capability embedded in the file name
|
||||
* The file name is combined with `root` by GetCachePath.
|
||||
*/
|
||||
std::string GetTimingCachePath(const std::string& root, cudaDeviceProp prop) {
|
||||
// Append the compute capability of the GPU to the file name, since a different compute capability invalidates the cache and TRT will throw when loading it
|
||||
const std::string timing_cache_name = "TensorrtExecutionProvider_cache_cc" +
|
||||
std::to_string(prop.major * 10 + prop.minor) + ".timing";
|
||||
const std::string timing_cache_name = "TensorrtExecutionProvider_cache_sm" +
|
||||
GetComputeCapacity(prop) + ".timing";
|
||||
return GetCachePath(root, timing_cache_name);
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue