From ac7538b909990f89f9bfebb325cd53c3d31cf158 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Wed, 10 Aug 2022 22:46:41 -0700 Subject: [PATCH] Remove CUDA 10.2 support (#12541) --- cmake/CMakeLists.txt | 13 ++-- include/onnxruntime/core/common/common.h | 6 -- .../onnxruntime/core/common/parse_string.h | 2 +- .../onnxruntime/core/framework/allocator.h | 2 +- .../core/framework/data_types_internal.h | 4 +- include/onnxruntime/core/framework/float16.h | 4 +- onnxruntime/core/framework/endian_utils.cc | 2 +- .../core/framework/tensorprotoutils.cc | 4 +- .../cpu/ml/tree_ensemble_classifier.cc | 2 +- .../core/providers/cpu/ml/treeregressor.cc | 2 +- onnxruntime/core/providers/cuda/nn/conv.cc | 2 +- .../providers/cuda/reduction/reduction_ops.cc | 4 +- .../providers/rocm/reduction/reduction_ops.cc | 4 +- .../test/framework/sparse_kernels_test.cc | 2 +- .../linux-gpu-cuda-11-pipeline.yml | 59 ------------------- .../win-gpu-reduce-op-ci-pipeline.yml | 4 +- .../github/windows/setup_env_cuda.bat | 2 - 17 files changed, 23 insertions(+), 95 deletions(-) delete mode 100644 tools/ci_build/github/windows/setup_env_cuda.bat diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 2395641eb5..ac2277eb19 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -1245,6 +1245,9 @@ endif() function(onnxruntime_set_compile_flags target_name) target_compile_definitions(${target_name} PUBLIC EIGEN_USE_THREADS) if (MSVC) + foreach(CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORY ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + target_compile_options(${target_name} PRIVATE "$<$>:/external:I${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORY}>") + endforeach() target_compile_definitions(${target_name} PUBLIC -DPLATFORM_WINDOWS -DNOGDI -DNOMINMAX -D_USE_MATH_DEFINES -D_SILENCE_ALL_CXX17_DEPRECATION_WARNINGS) if (onnxruntime_ENABLE_MEMLEAK_CHECKER) target_compile_definitions(${target_name} PUBLIC -DONNXRUNTIME_ENABLE_MEMLEAK_CHECK) @@ -1794,15 +1797,7 @@ if (onnxruntime_USE_CUDA) set(CMAKE_CUDA_RUNTIME_LIBRARY Shared) enable_language(CUDA) message( STATUS "CMAKE_CUDA_COMPILER_VERSION: ${CMAKE_CUDA_COMPILER_VERSION}") - if (WIN32) - set(CMAKE_CUDA_STANDARD 17) - foreach(CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORY ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) - string(APPEND CMAKE_CXX_FLAGS " /external:I\"${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORY}\"") - endforeach() - else() - #CUDA 10.2 on Linux doesn't support C++17 - set(CMAKE_CUDA_STANDARD 14) - endif() + set(CMAKE_CUDA_STANDARD 17) file(TO_CMAKE_PATH ${onnxruntime_CUDNN_HOME} onnxruntime_CUDNN_HOME) if (NOT CMAKE_CUDA_ARCHITECTURES) diff --git a/include/onnxruntime/core/common/common.h b/include/onnxruntime/core/common/common.h index cf6d2138ca..c405bcacca 100644 --- a/include/onnxruntime/core/common/common.h +++ b/include/onnxruntime/core/common/common.h @@ -278,12 +278,6 @@ inline std::wstring ToWideString(const std::wstring& s) { return s; } inline std::string ToWideString(const std::string& s) { return s; } #endif -#if ((__cplusplus >= 201703L) || (defined(_MSVC_LANG) && (_MSVC_LANG >= 201703L))) -#define ORT_IF_CONSTEXPR if constexpr -#else -#define ORT_IF_CONSTEXPR if -#endif - constexpr size_t kMaxStrLen = 2048; // Returns whether `key` is in `container`. diff --git a/include/onnxruntime/core/common/parse_string.h b/include/onnxruntime/core/common/parse_string.h index ba224868e6..edb34724f1 100644 --- a/include/onnxruntime/core/common/parse_string.h +++ b/include/onnxruntime/core/common/parse_string.h @@ -16,7 +16,7 @@ namespace onnxruntime { */ template bool TryParseStringWithClassicLocale(const std::string& str, T& value) { - ORT_IF_CONSTEXPR (std::is_integral::value && std::is_unsigned::value) { + if constexpr (std::is_integral::value && std::is_unsigned::value) { // if T is unsigned integral type, reject negative values which will wrap if (!str.empty() && str[0] == '-') { return false; diff --git a/include/onnxruntime/core/framework/allocator.h b/include/onnxruntime/core/framework/allocator.h index 565cba2b49..d26e331160 100644 --- a/include/onnxruntime/core/framework/allocator.h +++ b/include/onnxruntime/core/framework/allocator.h @@ -146,7 +146,7 @@ class IAllocator { size_t alloc_size = count_or_bytes; // if T is not void, 'count_or_bytes' == number of items so allow for that - ORT_IF_CONSTEXPR(!std::is_void::value) { + if constexpr(!std::is_void::value) { // sizeof(void) isn't valid, but the compiler isn't smart enough to ignore that this line isn't // reachable if T is void. use std::conditional to 'use' void* in the sizeof call if (!CalcMemSizeForArray( diff --git a/include/onnxruntime/core/framework/data_types_internal.h b/include/onnxruntime/core/framework/data_types_internal.h index 95ee9e6175..9d791a7f64 100644 --- a/include/onnxruntime/core/framework/data_types_internal.h +++ b/include/onnxruntime/core/framework/data_types_internal.h @@ -498,7 +498,7 @@ class ContainerChecker { ORT_ENFORCE(++index < c.size(), "Sequence is missing type entry for its element"); constexpr int32_t prim_type = ToTensorProtoElementType(); // Check if this is a primitive type and it matches - ORT_IF_CONSTEXPR(prim_type != ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED) { + if constexpr(prim_type != ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED) { return c[index].IsType(data_types_internal::ContainerType::kTensor) && c[index].IsPrimType(prim_type); } @@ -528,7 +528,7 @@ class ContainerChecker { } ORT_ENFORCE(++index < c.size(), "Map is missing type entry for its value"); constexpr int32_t val_type = ToTensorProtoElementType(); - ORT_IF_CONSTEXPR(val_type != ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED) { + if constexpr(val_type != ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED) { return c[index].IsType(data_types_internal::ContainerType::kTensor) && c[index].IsPrimType(val_type); } diff --git a/include/onnxruntime/core/framework/float16.h b/include/onnxruntime/core/framework/float16.h index ac30ae8238..04a5a9c97b 100644 --- a/include/onnxruntime/core/framework/float16.h +++ b/include/onnxruntime/core/framework/float16.h @@ -69,7 +69,7 @@ struct BFloat16 { val = static_cast((U32 + rounding_bias) >> 16); } #else - ORT_IF_CONSTEXPR(endian::native == endian::little) { + if constexpr(endian::native == endian::little) { std::memcpy(&val, reinterpret_cast(&v) + sizeof(uint16_t), sizeof(uint16_t)); } else { @@ -93,7 +93,7 @@ struct BFloat16 { float result; char* const first = reinterpret_cast(&result); char* const second = first + sizeof(uint16_t); - ORT_IF_CONSTEXPR(endian::native == endian::little) { + if constexpr(endian::native == endian::little) { std::memset(first, 0, sizeof(uint16_t)); std::memcpy(second, &val, sizeof(uint16_t)); } diff --git a/onnxruntime/core/framework/endian_utils.cc b/onnxruntime/core/framework/endian_utils.cc index b4afabeeb9..8b61aad769 100644 --- a/onnxruntime/core/framework/endian_utils.cc +++ b/onnxruntime/core/framework/endian_utils.cc @@ -56,7 +56,7 @@ Status CopyLittleEndian(size_t element_size_in_bytes, ORT_RETURN_IF(source_bytes.size_bytes() != destination_bytes.size_bytes(), "source and destination buffer size mismatch"); - ORT_IF_CONSTEXPR (endian::native == endian::little) { + if constexpr (endian::native == endian::little) { std::memcpy(destination_bytes.data(), source_bytes.data(), source_bytes.size_bytes()); } else { SwapByteOrderCopy(element_size_in_bytes, source_bytes, destination_bytes); diff --git a/onnxruntime/core/framework/tensorprotoutils.cc b/onnxruntime/core/framework/tensorprotoutils.cc index 4acf6cf05a..80a0d17a60 100644 --- a/onnxruntime/core/framework/tensorprotoutils.cc +++ b/onnxruntime/core/framework/tensorprotoutils.cc @@ -779,7 +779,7 @@ ONNXTensorElementDataType GetTensorElementType(const ONNX_NAMESPACE::TensorProto ONNX_NAMESPACE::TensorProto TensorToTensorProto(const Tensor& tensor, const std::string& tensor_proto_name) { // Given we are using the raw_data field in the protobuf, this will work only for little-endian format. - ORT_IF_CONSTEXPR(endian::native != endian::little) { + if constexpr(endian::native != endian::little) { ORT_THROW("Big endian not supported"); } @@ -1127,7 +1127,7 @@ static void SetIndices(gsl::span gathered_indices, auto* ind_dest = reinterpret_cast(raw_indices.data()); size_t dest_index = 0; for (auto src_index : gathered_indices) { - ORT_IF_CONSTEXPR(sizeof(T) == sizeof(int8_t)) { + if constexpr(sizeof(T) == sizeof(int8_t)) { ind_dest[dest_index] = static_cast(src_index); } else { diff --git a/onnxruntime/core/providers/cpu/ml/tree_ensemble_classifier.cc b/onnxruntime/core/providers/cpu/ml/tree_ensemble_classifier.cc index 9a82e9ca76..addbd81244 100644 --- a/onnxruntime/core/providers/cpu/ml/tree_ensemble_classifier.cc +++ b/onnxruntime/core/providers/cpu/ml/tree_ensemble_classifier.cc @@ -32,7 +32,7 @@ ADD_IN_TYPE_TREE_ENSEMBLE_CLASSIFIER_OP(int32_t); template TreeEnsembleClassifier::TreeEnsembleClassifier(const OpKernelInfo& info) : OpKernel(info) { - ORT_IF_CONSTEXPR(std::is_same::value) { + if constexpr(std::is_same::value) { p_tree_ensemble_ = std::make_unique>(); } else { diff --git a/onnxruntime/core/providers/cpu/ml/treeregressor.cc b/onnxruntime/core/providers/cpu/ml/treeregressor.cc index d744c6ce70..7ea0d70ef8 100644 --- a/onnxruntime/core/providers/cpu/ml/treeregressor.cc +++ b/onnxruntime/core/providers/cpu/ml/treeregressor.cc @@ -38,7 +38,7 @@ ONNX_CPU_OPERATOR_TYPED_ML_KERNEL( template TreeEnsembleRegressor::TreeEnsembleRegressor(const OpKernelInfo& info) : OpKernel(info) { - ORT_IF_CONSTEXPR(std::is_same::value) { + if constexpr(std::is_same::value) { p_tree_ensemble_ = std::make_unique>(); } else { diff --git a/onnxruntime/core/providers/cuda/nn/conv.cc b/onnxruntime/core/providers/cuda/nn/conv.cc index 8b0ddc0311..fd0d15640f 100644 --- a/onnxruntime/core/providers/cuda/nn/conv.cc +++ b/onnxruntime/core/providers/cuda/nn/conv.cc @@ -257,7 +257,7 @@ Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) const if (!s_.cached_benchmark_results.contains(x_dims_cudnn)) { // set math type to tensor core before algorithm search - ORT_IF_CONSTEXPR(std::is_same::value) + if constexpr(std::is_same::value) CUDNN_RETURN_IF_ERROR(cudnnSetConvolutionMathType(s_.conv_desc, CUDNN_TENSOR_OP_MATH)); cudnnConvolutionFwdAlgoPerf_t perf; diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc index 3ef51d4bf8..f318ee913f 100644 --- a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc @@ -205,7 +205,7 @@ Status ReduceKernel::ReduceKernelShared( } CudnnReduceDescriptor reduce_desc; - ORT_IF_CONSTEXPR (std::is_same::value) + if constexpr (std::is_same::value) ORT_RETURN_IF_ERROR(reduce_desc.Set(cudnn_reduce_op, CudnnTensor::GetDataType(), ReduceTensorIndices)); else ORT_RETURN_IF_ERROR(reduce_desc.Set(cudnn_reduce_op, cudnn_type_X, ReduceTensorIndices)); @@ -524,7 +524,7 @@ Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, Pr } CudnnReduceDescriptor reduce_desc; - ORT_IF_CONSTEXPR (std::is_same::value || std::is_same::value) { + if constexpr (std::is_same::value || std::is_same::value) { ORT_RETURN_IF_ERROR(reduce_desc.Set(cudnn_reduce_op, CudnnTensor::GetDataType(), ReduceTensorIndices)); } else { ORT_RETURN_IF_ERROR(reduce_desc.Set(cudnn_reduce_op, cudnn_type_X, ReduceTensorIndices)); diff --git a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc index 7f796e858a..9112156871 100644 --- a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc @@ -202,7 +202,7 @@ Status ReduceKernel::ReduceKernelShared( } MiopenReduceDescriptor reduce_desc; - ORT_IF_CONSTEXPR(std::is_same::value) + if constexpr(std::is_same::value) ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, MiopenTensor::GetDataType(), ReduceTensorIndices)); else ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, miopen_type_X, ReduceTensorIndices)); @@ -523,7 +523,7 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr } MiopenReduceDescriptor reduce_desc; - ORT_IF_CONSTEXPR (std::is_same::value || std::is_same::value) { + if constexpr (std::is_same::value || std::is_same::value) { ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, MiopenTensor::GetDataType(), ReduceTensorIndices)); } else { ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, miopen_type_X, ReduceTensorIndices)); diff --git a/onnxruntime/test/framework/sparse_kernels_test.cc b/onnxruntime/test/framework/sparse_kernels_test.cc index 54cb51cff2..a049af3edc 100644 --- a/onnxruntime/test/framework/sparse_kernels_test.cc +++ b/onnxruntime/test/framework/sparse_kernels_test.cc @@ -698,7 +698,7 @@ struct InsertIndices { std::vector indices_data; insert_indices_data(indices_1D, values_size, shape_size, indices_data, indices_tp); indices_tp.set_data_type(utils::ToTensorProtoElementType()); - ORT_IF_CONSTEXPR(sizeof(T) == sizeof(int8_t)) { + if constexpr(sizeof(T) == sizeof(int8_t)) { indices_tp.mutable_raw_data()->assign(reinterpret_cast(indices_data.data()), indices_data.size()); } else { diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-cuda-11-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-cuda-11-pipeline.yml index 809a72087e..90a40f1b66 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-cuda-11-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-cuda-11-pipeline.yml @@ -1,5 +1,3 @@ -#This file is for CUDA 10.2 and 11.4 even the filename just says 11 - resources: repositories: - repository: manylinux # The name used to reference this repository in the checkout step @@ -9,63 +7,6 @@ resources: ref: a8099af1b3e25f0489717ad9c4f9a2e25a8c5b36 jobs: -- job: Linux_Build_CUDA10_NV6 - timeoutInMinutes: 180 - workspace: - clean: all - pool: Onnxruntime-Linux-GPU-NV6 - steps: - - checkout: self - clean: true - submodules: recursive - - - template: templates/get-docker-image-steps.yml - parameters: - Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda11 - Context: tools/ci_build/github/linux/docker - DockerBuildArgs: "--network=host --build-arg POLICY=manylinux2014 --build-arg PLATFORM=x86_64 --build-arg BASEIMAGE=nvcr.io/nvidia/cuda:10.2-cudnn8-devel-centos7 --build-arg DEVTOOLSET_ROOTPATH=/opt/rh/devtoolset-8/root --build-arg PREPEND_PATH=/opt/rh/devtoolset-8/root/usr/bin: --build-arg LD_LIBRARY_PATH_ARG=/opt/rh/devtoolset-8/root/usr/lib64:/opt/rh/devtoolset-8/root/usr/lib:/opt/rh/devtoolset-8/root/usr/lib64/dyninst:/opt/rh/devtoolset-8/root/usr/lib/dyninst:/usr/local/lib64 --build-arg BUILD_UID=$( id -u )" - Repository: onnxruntimegpubuild - - task: CmdLine@2 - inputs: - script: | - mkdir -p $HOME/.onnx - docker run --gpus all -e CC=/opt/rh/devtoolset-8/root/usr/bin/cc -e CXX=/opt/rh/devtoolset-8/root/usr/bin/c++ -e CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" --rm \ - --volume /data/onnx:/data/onnx:ro \ - --volume $(Build.SourcesDirectory):/onnxruntime_src \ - --volume $(Build.BinariesDirectory):/build \ - --volume /data/models:/build/models:ro \ - --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \ - -e ALLOW_RELEASED_ONNX_OPSET_ONLY=0 \ - -e NIGHTLY_BUILD \ - -e BUILD_BUILDNUMBER \ - onnxruntimegpubuild \ - /opt/python/cp37-cp37m/bin/python3 /onnxruntime_src/tools/ci_build/build.py \ - --build_dir /build --cmake_generator Ninja \ - --config Release \ - --skip_submodule_sync \ - --build_shared_lib \ - --parallel \ - --build_wheel \ - --enable_onnx_tests --use_cuda --cuda_version=10.2 --cuda_home=/usr/local/cuda-10.2 --cudnn_home=/usr/local/cuda-10.2 \ - --enable_pybind --build_java --build_nodejs \ - --cmake_extra_defines CMAKE_CUDA_HOST_COMPILER=/opt/rh/devtoolset-8/root/usr/bin/cc CMAKE_CUDA_ARCHITECTURES=52 - workingDirectory: $(Build.SourcesDirectory) - - task: PublishTestResults@2 - displayName: 'Publish unit test results' - inputs: - testResultsFiles: '**/*.results.xml' - searchFolder: '$(Build.BinariesDirectory)' - testRunTitle: 'Unit Test Run' - condition: succeededOrFailed() - - - template: templates/component-governance-component-detection-steps.yml - parameters: - condition: 'succeeded' - - - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 - displayName: 'Clean Agent Directories' - condition: always() - - job: Linux_Build_CUDA11_NV6 timeoutInMinutes: 180 workspace: diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-reduce-op-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-reduce-op-ci-pipeline.yml index 7dda81a09c..bdb28c9cba 100644 --- a/tools/ci_build/github/azure-pipelines/win-gpu-reduce-op-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-gpu-reduce-op-ci-pipeline.yml @@ -13,7 +13,7 @@ jobs: MsbuildArguments: '-detailedsummary -maxcpucount -consoleloggerparameters:PerformanceSummary' OnnxRuntimeBuildDirectory: '$(Build.BinariesDirectory)' DOTNET_SKIP_FIRST_TIME_EXPERIENCE: true - EnvSetupScript: setup_env_cuda.bat + EnvSetupScript: setup_env_cuda_11.bat buildArch: x64 setVcvars: true timeoutInMinutes: 120 @@ -52,7 +52,7 @@ jobs: displayName: 'Build and test' inputs: scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' - arguments: '--config $(BuildConfig) --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_generator "Visual Studio 16 2019" --build_wheel --use_cuda --cuda_version=10.2 --cuda_home="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2" --cudnn_home="C:\local\cudnn-10.2-windows10-x64-v8.0.3.33\cuda" --cmake_extra_defines CMAKE_SYSTEM_VERSION=10.0.18362.0 "CMAKE_CUDA_ARCHITECTURES=52" --include_ops_by_config="$(Build.SourcesDirectory)\onnxruntime\test\testdata\required_ops.config"' + arguments: '--config $(BuildConfig) --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_generator "Visual Studio 16 2019" --build_wheel --use_cuda --cuda_version=11.4 --cuda_home="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.4" --cudnn_home="C:\local\cudnn-11.4-windows-x64-v8.2.2.26\cuda" --cmake_extra_defines CMAKE_SYSTEM_VERSION=10.0.18362.0 "CMAKE_CUDA_ARCHITECTURES=52" --include_ops_by_config="$(Build.SourcesDirectory)\onnxruntime\test\testdata\required_ops.config"' workingDirectory: '$(Build.BinariesDirectory)' - template: templates/component-governance-component-detection-steps.yml diff --git a/tools/ci_build/github/windows/setup_env_cuda.bat b/tools/ci_build/github/windows/setup_env_cuda.bat deleted file mode 100644 index 5af5e502f2..0000000000 --- a/tools/ci_build/github/windows/setup_env_cuda.bat +++ /dev/null @@ -1,2 +0,0 @@ -set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\extras\CUPTI\lib64;%PATH% -set GRADLE_OPTS=-Dorg.gradle.daemon=false