From bc219ed553fc8d4b8fa3c7b4476810a63a864d8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20M=C3=BCller?= <44298237+gedoensmax@users.noreply.github.com> Date: Wed, 17 Jan 2024 20:33:34 +0100 Subject: [PATCH] [TensorRT EP] Enable a minimal CUDA EP compilation without kernels (#19052) Adresses https://github.com/microsoft/onnxruntime/issues/18542. I followed the advice given by @RyanUnderhill [here](https://github.com/microsoft/onnxruntime/pull/18731#issuecomment-1848261925) and went with a minimal CUDA EP for now. --- cmake/CMakeLists.txt | 1 + cmake/onnxruntime_providers_cuda.cmake | 49 ++++++++++++++----- .../core/providers/cuda/cuda_context.h | 3 +- onnxruntime/core/providers/cuda/cuda_call.cc | 4 ++ .../core/providers/cuda/cuda_common.cc | 42 ++++++++-------- onnxruntime/core/providers/cuda/cuda_common.h | 6 ++- .../providers/cuda/cuda_execution_provider.cc | 14 +++++- onnxruntime/core/providers/cuda/cuda_pch.h | 7 +++ .../core/providers/cuda/cuda_stream_handle.cc | 4 ++ .../core/providers/cuda/cudnn_common.cc | 3 +- .../core/providers/cuda/cudnn_common.h | 3 +- 11 files changed, 97 insertions(+), 39 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index bc96218dac..712d5d7610 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -79,6 +79,7 @@ option(onnxruntime_USE_CUDA "Build with CUDA support" OFF) cmake_dependent_option(onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS "Build with CUDA unit tests" OFF "onnxruntime_USE_CUDA;onnxruntime_BUILD_UNIT_TESTS;LINUX" OFF) option(onnxruntime_USE_CUDA_NHWC_OPS "Build CUDA with NHWC op support" OFF) +option(onnxruntime_CUDA_MINIMAL "Build CUDA without any operations apart from memcpy ops. Usefuel for a very minial TRT build" OFF) option(onnxruntime_ENABLE_CUDA_LINE_NUMBER_INFO "When building with CUDA support, generate device code line number information." OFF) option(onnxruntime_USE_OPENVINO "Build with OpenVINO support" OFF) option(onnxruntime_USE_COREML "Build with CoreML support" OFF) diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake index 84d1376f99..9887d615c9 100644 --- a/cmake/onnxruntime_providers_cuda.cmake +++ b/cmake/onnxruntime_providers_cuda.cmake @@ -1,10 +1,25 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. - file(GLOB_RECURSE onnxruntime_providers_cuda_cc_srcs CONFIGURE_DEPENDS - "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.h" - "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cc" - ) + + if (onnxruntime_CUDA_MINIMAL) + file(GLOB onnxruntime_providers_cuda_cc_srcs CONFIGURE_DEPENDS + "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.h" + "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cc" + "${ONNXRUNTIME_ROOT}/core/providers/cuda/tunable/*.h" + "${ONNXRUNTIME_ROOT}/core/providers/cuda/tunable/*.cc" + ) + # Remove pch files + list(REMOVE_ITEM onnxruntime_providers_cuda_cc_srcs + "${ONNXRUNTIME_ROOT}/core/providers/cuda/integer_gemm.cc" + "${ONNXRUNTIME_ROOT}/core/providers/cuda/triton_kernel.h" + ) + else() + file(GLOB_RECURSE onnxruntime_providers_cuda_cc_srcs CONFIGURE_DEPENDS + "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.h" + "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cc" + ) + endif() # Remove pch files list(REMOVE_ITEM onnxruntime_providers_cuda_cc_srcs "${ONNXRUNTIME_ROOT}/core/providers/cuda/cuda_pch.h" @@ -16,11 +31,16 @@ "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.h" "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.cc" ) - file(GLOB_RECURSE onnxruntime_providers_cuda_cu_srcs CONFIGURE_DEPENDS - "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cu" - "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cuh" - ) + + if (onnxruntime_CUDA_MINIMAL) + set(onnxruntime_providers_cuda_shared_srcs "") + else() + file(GLOB_RECURSE onnxruntime_providers_cuda_cu_srcs CONFIGURE_DEPENDS + "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cu" + "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cuh" + ) + endif() source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_cuda_cc_srcs} ${onnxruntime_providers_cuda_shared_srcs} ${onnxruntime_providers_cuda_cu_srcs}) set(onnxruntime_providers_cuda_src ${onnxruntime_providers_cuda_cc_srcs} ${onnxruntime_providers_cuda_shared_srcs} ${onnxruntime_providers_cuda_cu_srcs}) @@ -156,10 +176,15 @@ endif() add_dependencies(${target} onnxruntime_providers_shared ${onnxruntime_EXTERNAL_DEPENDENCIES}) - target_link_libraries(${target} PRIVATE cublasLt cublas cudnn curand cufft ${ABSEIL_LIBS} ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 safeint_interface) - if(onnxruntime_CUDNN_HOME) - target_include_directories(${target} PRIVATE ${onnxruntime_CUDNN_HOME}/include) - target_link_directories(${target} PRIVATE ${onnxruntime_CUDNN_HOME}/lib) + if(onnxruntime_CUDA_MINIMAL) + target_compile_definitions(${target} PRIVATE USE_CUDA_MINIMAL) + target_link_libraries(${target} PRIVATE ${ABSEIL_LIBS} ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 safeint_interface) + else() + target_link_libraries(${target} PRIVATE cublasLt cublas cudnn curand cufft ${ABSEIL_LIBS} ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 safeint_interface) + if(onnxruntime_CUDNN_HOME) + target_include_directories(${target} PRIVATE ${onnxruntime_CUDNN_HOME}/include) + target_link_directories(${target} PRIVATE ${onnxruntime_CUDNN_HOME}/lib) + endif() endif() if (onnxruntime_USE_TRITON_KERNEL) diff --git a/include/onnxruntime/core/providers/cuda/cuda_context.h b/include/onnxruntime/core/providers/cuda/cuda_context.h index 9416fad5f1..1370f5c4c5 100644 --- a/include/onnxruntime/core/providers/cuda/cuda_context.h +++ b/include/onnxruntime/core/providers/cuda/cuda_context.h @@ -16,9 +16,10 @@ #include "core/providers/custom_op_context.h" #include #include +#ifndef USE_CUDA_MINIMAL #include #include - +#endif namespace Ort { namespace Custom { diff --git a/onnxruntime/core/providers/cuda/cuda_call.cc b/onnxruntime/core/providers/cuda/cuda_call.cc index 4f223041e0..f60684795a 100644 --- a/onnxruntime/core/providers/cuda/cuda_call.cc +++ b/onnxruntime/core/providers/cuda/cuda_call.cc @@ -30,6 +30,7 @@ const char* CudaErrString(cudaError_t x) { return cudaGetErrorString(x); } +#ifndef USE_CUDA_MINIMAL template <> const char* CudaErrString(cublasStatus_t e) { cudaDeviceSynchronize(); @@ -76,6 +77,7 @@ const char* CudaErrString(cufftResult e) { return "Unknown cufft error status"; } } +#endif #ifdef ORT_USE_NCCL template <> @@ -132,6 +134,7 @@ std::conditional_t CudaCall( template Status CudaCall(cudaError retCode, const char* exprString, const char* libName, cudaError successCode, const char* msg, const char* file, const int line); template void CudaCall(cudaError retCode, const char* exprString, const char* libName, cudaError successCode, const char* msg, const char* file, const int line); +#ifndef USE_CUDA_MINIMAL template Status CudaCall(cublasStatus_t retCode, const char* exprString, const char* libName, cublasStatus_t successCode, const char* msg, const char* file, const int line); template void CudaCall(cublasStatus_t retCode, const char* exprString, const char* libName, cublasStatus_t successCode, const char* msg, const char* file, const int line); template Status CudaCall(cudnnStatus_t retCode, const char* exprString, const char* libName, cudnnStatus_t successCode, const char* msg, const char* file, const int line); @@ -140,6 +143,7 @@ template Status CudaCall(curandStatus_t retCode, const ch template void CudaCall(curandStatus_t retCode, const char* exprString, const char* libName, curandStatus_t successCode, const char* msg, const char* file, const int line); template Status CudaCall(cufftResult retCode, const char* exprString, const char* libName, cufftResult successCode, const char* msg, const char* file, const int line); template void CudaCall(cufftResult retCode, const char* exprString, const char* libName, cufftResult successCode, const char* msg, const char* file, const int line); +#endif #ifdef ORT_USE_NCCL template Status CudaCall(ncclResult_t retCode, const char* exprString, const char* libName, ncclResult_t successCode, const char* msg, const char* file, const int line); diff --git a/onnxruntime/core/providers/cuda/cuda_common.cc b/onnxruntime/core/providers/cuda/cuda_common.cc index 33f2938940..65083f89f7 100644 --- a/onnxruntime/core/providers/cuda/cuda_common.cc +++ b/onnxruntime/core/providers/cuda/cuda_common.cc @@ -14,6 +14,27 @@ namespace cuda { // 0x04 - pedantic constexpr const char* kCudaGemmOptions = "ORT_CUDA_GEMM_OPTIONS"; +const char* CudaDataTypeToString(cudaDataType_t dt) { + switch (dt) { + case CUDA_R_16F: + return "CUDA_R_16F"; + case CUDA_R_16BF: + return "CUDA_R_16BF"; + case CUDA_R_32F: + return "CUDA_R_32F"; +#if !defined(DISABLE_FLOAT8_TYPES) + // Note: CUDA_R_8F_E4M3 is defined with CUDA>=11.8 + case CUDA_R_8F_E4M3: + return "CUDA_R_8F_E4M3"; + case CUDA_R_8F_E5M2: + return "CUDA_R_8F_E5M2"; +#endif + default: + return ""; + } +} + +#ifndef USE_CUDA_MINIMAL // Initialize the singleton instance HalfGemmOptions HalfGemmOptions::instance; @@ -54,26 +75,6 @@ const char* cublasGetErrorEnum(cublasStatus_t error) { } } -const char* CudaDataTypeToString(cudaDataType_t dt) { - switch (dt) { - case CUDA_R_16F: - return "CUDA_R_16F"; - case CUDA_R_16BF: - return "CUDA_R_16BF"; - case CUDA_R_32F: - return "CUDA_R_32F"; -#if !defined(DISABLE_FLOAT8_TYPES) - // Note: CUDA_R_8F_E4M3 is defined with CUDA>=11.8 - case CUDA_R_8F_E4M3: - return "CUDA_R_8F_E4M3"; - case CUDA_R_8F_E5M2: - return "CUDA_R_8F_E5M2"; -#endif - default: - return ""; - } -} - const char* CublasComputeTypeToString(cublasComputeType_t ct) { switch (ct) { case CUBLAS_COMPUTE_16F: @@ -92,6 +93,7 @@ const char* CublasComputeTypeToString(cublasComputeType_t ct) { return ""; } } +#endif // It must exist somewhere already. cudaDataType_t ToCudaDataType(int32_t element_type) { diff --git a/onnxruntime/core/providers/cuda/cuda_common.h b/onnxruntime/core/providers/cuda/cuda_common.h index 707099bac3..e9941ce743 100644 --- a/onnxruntime/core/providers/cuda/cuda_common.h +++ b/onnxruntime/core/providers/cuda/cuda_common.h @@ -22,13 +22,14 @@ namespace onnxruntime { namespace cuda { #define CUDA_RETURN_IF_ERROR(expr) ORT_RETURN_IF_ERROR(CUDA_CALL(expr)) +#ifndef USE_CUDA_MINIMAL #define CUBLAS_RETURN_IF_ERROR(expr) ORT_RETURN_IF_ERROR(CUBLAS_CALL(expr)) #define CUSPARSE_RETURN_IF_ERROR(expr) ORT_RETURN_IF_ERROR(CUSPARSE_CALL(expr)) #define CURAND_RETURN_IF_ERROR(expr) ORT_RETURN_IF_ERROR(CURAND_CALL(expr)) #define CUDNN_RETURN_IF_ERROR(expr) ORT_RETURN_IF_ERROR(CUDNN_CALL(expr)) #define CUDNN2_RETURN_IF_ERROR(expr, m) ORT_RETURN_IF_ERROR(CUDNN_CALL2(expr, m)) #define CUFFT_RETURN_IF_ERROR(expr) ORT_RETURN_IF_ERROR(CUFFT_CALL(expr)) - +#endif // Type mapping for MLFloat16 to half template class ToCudaType { @@ -93,7 +94,7 @@ inline bool CalculateFdmStrides(gsl::span p, const std::vector KernelCreateInfo BuildKernelCreateInfo() { @@ -1326,6 +1332,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, // default entry to avoid the list become empty after ops-reducing BuildKernelCreateInfo, BuildKernelCreateInfo, +#ifndef USE_CUDA_MINIMAL BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -2201,6 +2208,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, +#endif }; for (auto& function_table_entry : function_table) { @@ -2210,6 +2218,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { } } +#ifndef USE_CUDA_MINIMAL #ifndef DISABLE_CONTRIB_OPS ORT_RETURN_IF_ERROR(::onnxruntime::contrib::cuda::RegisterCudaContribKernels(kernel_registry)); #endif @@ -2220,6 +2229,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { #ifdef ENABLE_TRAINING_OPS ORT_RETURN_IF_ERROR(::onnxruntime::cuda::RegisterCudaTrainingKernels(kernel_registry)); +#endif #endif return Status::OK(); diff --git a/onnxruntime/core/providers/cuda/cuda_pch.h b/onnxruntime/core/providers/cuda/cuda_pch.h index f48554e8f1..dfe50fe0a8 100644 --- a/onnxruntime/core/providers/cuda/cuda_pch.h +++ b/onnxruntime/core/providers/cuda/cuda_pch.h @@ -10,12 +10,19 @@ #include #include +#include +#ifndef USE_CUDA_MINIMAL #include #include #include #include #include #include +#else +typedef void* cudnnHandle_t; +typedef void* cublasHandle_t; +typedef void* cublasLtHandle_t; +#endif #ifdef ORT_USE_NCCL #include diff --git a/onnxruntime/core/providers/cuda/cuda_stream_handle.cc b/onnxruntime/core/providers/cuda/cuda_stream_handle.cc index 7c866395ec..0a256394b7 100644 --- a/onnxruntime/core/providers/cuda/cuda_stream_handle.cc +++ b/onnxruntime/core/providers/cuda/cuda_stream_handle.cc @@ -69,6 +69,7 @@ CudaStream::CudaStream(cudaStream_t stream, release_cpu_buffer_on_cuda_stream_(release_cpu_buffer_on_cuda_stream), deferred_cpu_allocator_(*this), ep_info_(ep_info) { +#ifndef USE_CUDA_MINIMAL if (own_flag) { CUBLAS_CALL_THROW(cublasCreate(&cublas_handle_)); CUBLAS_CALL_THROW(cublasSetStream(cublas_handle_, stream)); @@ -80,10 +81,12 @@ CudaStream::CudaStream(cudaStream_t stream, cudnn_handle_ = external_cudnn_handle; CUDNN_CALL_THROW(cudnnSetStream(cudnn_handle_, stream)); } +#endif } CudaStream::~CudaStream() { ORT_IGNORE_RETURN_VALUE(CleanUpOnRunEnd()); +#ifndef USE_CUDA_MINIMAL if (own_stream_) { cublasDestroy(cublas_handle_); cudnnDestroy(cudnn_handle_); @@ -91,6 +94,7 @@ CudaStream::~CudaStream() { if (handle) cudaStreamDestroy(static_cast(handle)); } +#endif } std::unique_ptr CudaStream::CreateNotification(size_t /*num_consumers*/) { diff --git a/onnxruntime/core/providers/cuda/cudnn_common.cc b/onnxruntime/core/providers/cuda/cudnn_common.cc index 4df59a98b1..c850f7b583 100644 --- a/onnxruntime/core/providers/cuda/cudnn_common.cc +++ b/onnxruntime/core/providers/cuda/cudnn_common.cc @@ -9,7 +9,7 @@ #include "core/common/gsl.h" #include "shared_inc/cuda_call.h" #include "core/providers/cpu/tensor/utils.h" - +#ifndef USE_CUDA_MINIMAL namespace onnxruntime { namespace cuda { @@ -222,3 +222,4 @@ const Float8E5M2 Consts::One = Float8E5M2(1.0f, true); } // namespace cuda } // namespace onnxruntime +#endif diff --git a/onnxruntime/core/providers/cuda/cudnn_common.h b/onnxruntime/core/providers/cuda/cudnn_common.h index 8a94a334ee..fdd14dedad 100644 --- a/onnxruntime/core/providers/cuda/cudnn_common.h +++ b/onnxruntime/core/providers/cuda/cudnn_common.h @@ -7,7 +7,7 @@ #include #include "core/providers/cuda/cuda_common.h" - +#ifndef USE_CUDA_MINIMAL namespace onnxruntime { namespace cuda { @@ -260,3 +260,4 @@ SetPoolingNdDescriptorHelper(cudnnPoolingDescriptor_t poolingDesc, } // namespace cuda } // namespace onnxruntime +#endif