From 2b7e2a5bd07a882a1a1f16e81025a74745ef0394 Mon Sep 17 00:00:00 2001
From: Tianlei Wu <tlwu@microsoft.com>
Date: Tue, 23 Jul 2024 11:58:04 -0700
Subject: [PATCH] [CUDA] Fix cuda provider fallback inconsistency (#21425)

* Fix fallback setting (cuda still falls back to cuda).
* Fix cuda provider fallback inconsistent with/without CUDA_PATH
environment variable.
* Add cuda and cudnn major version requirement in error message.

Example result in Windows:
```
>>> import onnxruntime
>>> ort_session = onnxruntime.InferenceSession("model.onnx", providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
2024-07-19 17:43:44.2260019 [E:onnxruntime:Default, provider_bridge_ort.cc:1972 onnxruntime::TryGetProviderInfo_CUDA] D:\onnxruntime\onnxruntime\core\session\provider_bridge_ort.cc:1636 onnxruntime::ProviderLibrary::Get [ONNXRuntimeError] : 1 : FAIL : LoadLibrary failed with error 126 "" when trying to load "C:\Users\.conda\envs\py310\lib\site-packages\onnxruntime\capi\onnxruntime_providers_cuda.dll"

2024-07-19 17:43:44.2312351 [W:onnxruntime:Default, onnxruntime_pybind_state.cc:970 onnxruntime::python::CreateExecutionProviderInstance] Failed to create CUDAExecutionProvider. Require cuDNN 9.* and CUDA 12.*, and the latest MSVC runtime. Please install all dependencies as mentioned in the GPU requirements page (https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#requirements), make sure they're in the PATH, and that your GPU is supported.
>>> ort_session
<onnxruntime.capi.onnxruntime_inference_collection.InferenceSession object at 0x0000016BB2DF7D60>
>>> ort_session.get_providers()
['CPUExecutionProvider']
```

Example result in Linux:
```
>>> import onnxruntime
>>> ort_session = onnxruntime.InferenceSession("resnet50-v2-7.onnx", providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
2024-07-20 20:33:26.486974543 [E:onnxruntime:Default, provider_bridge_ort.cc:1972 TryGetProviderInfo_CUDA] /work/onnxruntime/onnxruntime/core/session/provider_bridge_ort.cc:1636 onnxruntime::Provider& onnxruntime::ProviderLibrary::Get() [ONNXRuntimeError] : 1 : FAIL : Failed to load library libonnxruntime_providers_cuda.so with error: libcublasLt.so.12: cannot open shared object file: No such file or directory

2024-07-20 20:33:26.487034646 [W:onnxruntime:Default, onnxruntime_pybind_state.cc:961 CreateExecutionProviderInstance] Failed to create CUDAExecutionProvider. Require cuDNN 9.* and CUDA 12.*. Please install all dependencies as mentioned in the GPU requirements page (https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#requirements), make sure they're in the PATH, and that your GPU is supported.
>>> ort_session.get_providers()
['CPUExecutionProvider']
```
### Motivation and Context
https://github.com/microsoft/onnxruntime/issues/21424
---
 cmake/onnxruntime_python.cmake                |  8 +++++--
 .../onnxruntime_inference_collection.py       | 16 +++++++++----
 .../python/onnxruntime_pybind_state.cc        | 24 ++++++++++---------
 3 files changed, 31 insertions(+), 17 deletions(-)

diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake
index 07c65e7986..270139ceaf 100644
--- a/cmake/onnxruntime_python.cmake
+++ b/cmake/onnxruntime_python.cmake
@@ -97,8 +97,12 @@ endif()
 
 onnxruntime_add_include_to_target(onnxruntime_pybind11_state Python::Module Python::NumPy)
 target_include_directories(onnxruntime_pybind11_state PRIVATE ${ONNXRUNTIME_ROOT} ${pybind11_INCLUDE_DIRS})
-if(onnxruntime_USE_CUDA AND onnxruntime_CUDNN_HOME)
-    target_include_directories(onnxruntime_pybind11_state PRIVATE ${onnxruntime_CUDNN_HOME}/include)
+if(onnxruntime_USE_CUDA)
+    target_include_directories(onnxruntime_pybind11_state PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+    # cudnn_home is optional for Window when cuda and cudnn are installed in the same directory.
+    if(onnxruntime_CUDNN_HOME)
+      target_include_directories(onnxruntime_pybind11_state PRIVATE ${onnxruntime_CUDNN_HOME}/include)
+    endif()
 endif()
 if(onnxruntime_USE_CANN)
     target_include_directories(onnxruntime_pybind11_state PRIVATE ${onnxruntime_CANN_HOME}/include)
diff --git a/onnxruntime/python/onnxruntime_inference_collection.py b/onnxruntime/python/onnxruntime_inference_collection.py
index ecae280e92..c3cfe2c97a 100644
--- a/onnxruntime/python/onnxruntime_inference_collection.py
+++ b/onnxruntime/python/onnxruntime_inference_collection.py
@@ -438,10 +438,18 @@ class InferenceSession(Session):
 
         # Tensorrt can fall back to CUDA if it's explicitly assigned. All others fall back to CPU.
         if "TensorrtExecutionProvider" in available_providers:
-            if providers and any(
-                provider == "CUDAExecutionProvider"
-                or (isinstance(provider, tuple) and provider[0] == "CUDAExecutionProvider")
-                for provider in providers
+            if (
+                providers
+                and any(
+                    provider == "CUDAExecutionProvider"
+                    or (isinstance(provider, tuple) and provider[0] == "CUDAExecutionProvider")
+                    for provider in providers
+                )
+                and any(
+                    provider == "TensorrtExecutionProvider"
+                    or (isinstance(provider, tuple) and provider[0] == "TensorrtExecutionProvider")
+                    for provider in providers
+                )
             ):
                 self._fallback_providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
             else:
diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc
index d7155b2b68..6b5daf8cb8 100644
--- a/onnxruntime/python/onnxruntime_pybind_state.cc
+++ b/onnxruntime/python/onnxruntime_pybind_state.cc
@@ -35,6 +35,11 @@
 #include "contrib_ops/cpu/aten_ops/aten_op_executor.h"
 #endif
 
+#ifdef USE_CUDA
+#include <cuda.h>   // for CUDA_VERSION
+#include <cudnn.h>  // for CUDNN_MAJOR
+#endif
+
 #include <pybind11/functional.h>
 
 // Explicitly provide a definition for the static const var 'GPU' in the OrtDevice struct,
@@ -951,21 +956,18 @@ std::unique_ptr<IExecutionProvider> CreateExecutionProviderInstance(
         // external CUDA allocator.
         external_allocator_info = info.external_allocator_info;
         return cuda_provider_info->CreateExecutionProviderFactory(info)->CreateProvider();
-      } else {
-        if (!Env::Default().GetEnvironmentVar("CUDA_PATH").empty()) {
-          ORT_THROW(
-              "CUDA_PATH is set but CUDA wasnt able to be loaded. Please install the correct version of CUDA and"
-              "cuDNN as mentioned in the GPU requirements page "
-              " (https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#requirements), "
-              " make sure they're in the PATH, and that your GPU is supported.");
-        }
       }
     }
     LOGS_DEFAULT(WARNING) << "Failed to create "
                           << type
-                          << ". Please reference "
-                          << "https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#requirements"
-                          << "to ensure all dependencies are met.";
+                          << ". Require cuDNN " << CUDNN_MAJOR << ".* and "
+                          << "CUDA " << (CUDA_VERSION / 1000) << ".*"
+#if defined(_MSC_VER)
+                          << ", and the latest MSVC runtime"
+#endif
+                          << ". Please install all dependencies as mentioned in the GPU requirements page"
+                             " (https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#requirements), "
+                             "make sure they're in the PATH, and that your GPU is supported.";
 #endif
   } else if (type == kRocmExecutionProvider) {
 #ifdef USE_ROCM