Remove CUDA 10.2 support (#12541)

2026-07-23 19:32:23 +00:00 · 2022-08-10 22:46:41 -07:00 · 2022-08-10 22:46:41 -07:00 · ac7538b909
commit ac7538b909
parent 819c36701f
17 changed files with 23 additions and 95 deletions
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@ -1245,6 +1245,9 @@ endif()
 function(onnxruntime_set_compile_flags target_name)
    target_compile_definitions(${target_name} PUBLIC EIGEN_USE_THREADS)
    if (MSVC)
+      foreach(cuda_toolkit_include_dir ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})  # no-op when the list is empty
+        target_compile_options(${target_name} PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:/external:I${cuda_toolkit_include_dir}>")  # C++ sources only, matching the CMAKE_CXX_FLAGS scope this replaces
+      endforeach()
      target_compile_definitions(${target_name} PUBLIC -DPLATFORM_WINDOWS -DNOGDI -DNOMINMAX -D_USE_MATH_DEFINES -D_SILENCE_ALL_CXX17_DEPRECATION_WARNINGS)
      if (onnxruntime_ENABLE_MEMLEAK_CHECKER)
        target_compile_definitions(${target_name} PUBLIC -DONNXRUNTIME_ENABLE_MEMLEAK_CHECK)
@ -1794,15 +1797,7 @@ if (onnxruntime_USE_CUDA)
  set(CMAKE_CUDA_RUNTIME_LIBRARY Shared)
  enable_language(CUDA)
  message( STATUS "CMAKE_CUDA_COMPILER_VERSION: ${CMAKE_CUDA_COMPILER_VERSION}")
-  if (WIN32)
-    set(CMAKE_CUDA_STANDARD 17)
-    foreach(CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORY ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
-      string(APPEND CMAKE_CXX_FLAGS " /external:I\"${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORY}\"")
-    endforeach()
-  else()
-    #CUDA 10.2 on Linux doesn't support C++17
-    set(CMAKE_CUDA_STANDARD 14)
-  endif()
+  set(CMAKE_CUDA_STANDARD 17)
  file(TO_CMAKE_PATH ${onnxruntime_CUDNN_HOME} onnxruntime_CUDNN_HOME)

  if (NOT CMAKE_CUDA_ARCHITECTURES)
--- a/include/onnxruntime/core/common/common.h
+++ b/include/onnxruntime/core/common/common.h
@ -278,12 +278,6 @@ inline std::wstring ToWideString(const std::wstring& s) { return s; }
 inline std::string ToWideString(const std::string& s) { return s; }
 #endif

-#if ((__cplusplus >= 201703L) || (defined(_MSVC_LANG) && (_MSVC_LANG >= 201703L)))
-#define ORT_IF_CONSTEXPR if constexpr
-#else
-#define ORT_IF_CONSTEXPR if
-#endif
-
 constexpr size_t kMaxStrLen = 2048;

 // Returns whether `key` is in `container`.
--- a/include/onnxruntime/core/common/parse_string.h
+++ b/include/onnxruntime/core/common/parse_string.h
@ -16,7 +16,7 @@ namespace onnxruntime {
 */
 template <typename T>
 bool TryParseStringWithClassicLocale(const std::string& str, T& value) {
-  ORT_IF_CONSTEXPR (std::is_integral<T>::value && std::is_unsigned<T>::value) {
+  if constexpr (std::is_integral<T>::value && std::is_unsigned<T>::value) {
    // if T is unsigned integral type, reject negative values which will wrap
    if (!str.empty() && str[0] == '-') {
      return false;
--- a/include/onnxruntime/core/framework/allocator.h
+++ b/include/onnxruntime/core/framework/allocator.h
@ -146,7 +146,7 @@ class IAllocator {
    size_t alloc_size = count_or_bytes;

    // if T is not void, 'count_or_bytes' == number of items so allow for that
-    ORT_IF_CONSTEXPR(!std::is_void<T>::value) {
+    if constexpr(!std::is_void<T>::value) {
      // sizeof(void) isn't valid, but the compiler isn't smart enough to ignore that this line isn't
      // reachable if T is void. use std::conditional to 'use' void* in the sizeof call
      if (!CalcMemSizeForArray(
--- a/include/onnxruntime/core/framework/data_types_internal.h
+++ b/include/onnxruntime/core/framework/data_types_internal.h
@ -498,7 +498,7 @@ class ContainerChecker {
        ORT_ENFORCE(++index < c.size(), "Sequence is missing type entry for its element");
        constexpr int32_t prim_type = ToTensorProtoElementType<T>();
        // Check if this is a primitive type and it matches
-        ORT_IF_CONSTEXPR(prim_type != ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED) {
+        if constexpr(prim_type != ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED) {
          return c[index].IsType(data_types_internal::ContainerType::kTensor) &&
                 c[index].IsPrimType(prim_type);
        }
@ -528,7 +528,7 @@ class ContainerChecker {
      }
      ORT_ENFORCE(++index < c.size(), "Map is missing type entry for its value");
      constexpr int32_t val_type = ToTensorProtoElementType<V>();
-      ORT_IF_CONSTEXPR(val_type != ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED) {
+      if constexpr(val_type != ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED) {
        return c[index].IsType(data_types_internal::ContainerType::kTensor) &&
               c[index].IsPrimType(val_type);
      }
--- a/include/onnxruntime/core/framework/float16.h
+++ b/include/onnxruntime/core/framework/float16.h
@ -69,7 +69,7 @@ struct BFloat16 {
      val = static_cast<uint16_t>((U32 + rounding_bias) >> 16);
    }
 #else
-    ORT_IF_CONSTEXPR(endian::native == endian::little) {
+    if constexpr(endian::native == endian::little) {
      std::memcpy(&val, reinterpret_cast<char*>(&v) + sizeof(uint16_t), sizeof(uint16_t));
    }
    else {
@ -93,7 +93,7 @@ struct BFloat16 {
    float result;
    char* const first = reinterpret_cast<char*>(&result);
    char* const second = first + sizeof(uint16_t);
-    ORT_IF_CONSTEXPR(endian::native == endian::little) {
+    if constexpr(endian::native == endian::little) {
      std::memset(first, 0, sizeof(uint16_t));
      std::memcpy(second, &val, sizeof(uint16_t));
    }
--- a/onnxruntime/core/framework/endian_utils.cc
+++ b/onnxruntime/core/framework/endian_utils.cc
@ -56,7 +56,7 @@ Status CopyLittleEndian(size_t element_size_in_bytes,
  ORT_RETURN_IF(source_bytes.size_bytes() != destination_bytes.size_bytes(),
                "source and destination buffer size mismatch");

-  ORT_IF_CONSTEXPR (endian::native == endian::little) {
+  if constexpr (endian::native == endian::little) {
    std::memcpy(destination_bytes.data(), source_bytes.data(), source_bytes.size_bytes());
  } else {
    SwapByteOrderCopy(element_size_in_bytes, source_bytes, destination_bytes);
--- a/onnxruntime/core/framework/tensorprotoutils.cc
+++ b/onnxruntime/core/framework/tensorprotoutils.cc
@ -779,7 +779,7 @@ ONNXTensorElementDataType GetTensorElementType(const ONNX_NAMESPACE::TensorProto

 ONNX_NAMESPACE::TensorProto TensorToTensorProto(const Tensor& tensor, const std::string& tensor_proto_name) {
  // Given we are using the raw_data field in the protobuf, this will work only for little-endian format.
-  ORT_IF_CONSTEXPR(endian::native != endian::little) {
+  if constexpr(endian::native != endian::little) {
    ORT_THROW("Big endian not supported");
  }

@ -1127,7 +1127,7 @@ static void SetIndices(gsl::span<int64_t> gathered_indices,
  auto* ind_dest = reinterpret_cast<T*>(raw_indices.data());
  size_t dest_index = 0;
  for (auto src_index : gathered_indices) {
-    ORT_IF_CONSTEXPR(sizeof(T) == sizeof(int8_t)) {
+    if constexpr(sizeof(T) == sizeof(int8_t)) {
      ind_dest[dest_index] = static_cast<T>(src_index);
    }
    else {
--- a/onnxruntime/core/providers/cpu/ml/tree_ensemble_classifier.cc
+++ b/onnxruntime/core/providers/cpu/ml/tree_ensemble_classifier.cc
@ -32,7 +32,7 @@ ADD_IN_TYPE_TREE_ENSEMBLE_CLASSIFIER_OP(int32_t);

 template <typename T>
 TreeEnsembleClassifier<T>::TreeEnsembleClassifier(const OpKernelInfo& info) : OpKernel(info) {
-  ORT_IF_CONSTEXPR(std::is_same<T, double>::value) {
+  if constexpr(std::is_same<T, double>::value) {
    p_tree_ensemble_ = std::make_unique<detail::TreeEnsembleCommonClassifier<T, double, OutputType>>();
  }
  else {
--- a/onnxruntime/core/providers/cpu/ml/treeregressor.cc
+++ b/onnxruntime/core/providers/cpu/ml/treeregressor.cc
@ -38,7 +38,7 @@ ONNX_CPU_OPERATOR_TYPED_ML_KERNEL(

 template <typename T>
 TreeEnsembleRegressor<T>::TreeEnsembleRegressor(const OpKernelInfo& info) : OpKernel(info) {
-  ORT_IF_CONSTEXPR(std::is_same<T, double>::value) {
+  if constexpr(std::is_same<T, double>::value) {
    p_tree_ensemble_ = std::make_unique<detail::TreeEnsembleCommon<T, double, OutputType>>();
  }
  else {
--- a/onnxruntime/core/providers/cuda/nn/conv.cc
+++ b/onnxruntime/core/providers/cuda/nn/conv.cc
@ -257,7 +257,7 @@ Status Conv<T>::UpdateState(OpKernelContext* context, bool bias_expected) const

    if (!s_.cached_benchmark_results.contains(x_dims_cudnn)) {
      // set math type to tensor core before algorithm search
-      ORT_IF_CONSTEXPR(std::is_same<T, MLFloat16>::value)
+      if constexpr(std::is_same<T, MLFloat16>::value)
        CUDNN_RETURN_IF_ERROR(cudnnSetConvolutionMathType(s_.conv_desc, CUDNN_TENSOR_OP_MATH));

      cudnnConvolutionFwdAlgoPerf_t perf;
--- a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc
+++ b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc
@ -205,7 +205,7 @@ Status ReduceKernel<allow_multi_axes>::ReduceKernelShared(
  }

  CudnnReduceDescriptor reduce_desc;
-  ORT_IF_CONSTEXPR (std::is_same<T, MLFloat16>::value)
+  if constexpr (std::is_same<T, MLFloat16>::value)
    ORT_RETURN_IF_ERROR(reduce_desc.Set(cudnn_reduce_op, CudnnTensor::GetDataType<float>(), ReduceTensorIndices));
  else
    ORT_RETURN_IF_ERROR(reduce_desc.Set(cudnn_reduce_op, cudnn_type_X, ReduceTensorIndices));
@ -524,7 +524,7 @@ Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, Pr
  }

  CudnnReduceDescriptor reduce_desc;
-  ORT_IF_CONSTEXPR (std::is_same<T, MLFloat16>::value || std::is_same<T, BFloat16>::value) {
+  if constexpr (std::is_same<T, MLFloat16>::value || std::is_same<T, BFloat16>::value) {
    ORT_RETURN_IF_ERROR(reduce_desc.Set(cudnn_reduce_op, CudnnTensor::GetDataType<float>(), ReduceTensorIndices));
  } else {
    ORT_RETURN_IF_ERROR(reduce_desc.Set(cudnn_reduce_op, cudnn_type_X, ReduceTensorIndices));
--- a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc
+++ b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc
@ -202,7 +202,7 @@ Status ReduceKernel<allow_multi_axes>::ReduceKernelShared(
  }

  MiopenReduceDescriptor reduce_desc;
-  ORT_IF_CONSTEXPR(std::is_same<T, MLFloat16>::value)
+  if constexpr(std::is_same<T, MLFloat16>::value)
  ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, MiopenTensor::GetDataType<float>(), ReduceTensorIndices));
  else ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, miopen_type_X, ReduceTensorIndices));

@ -523,7 +523,7 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr
  }

  MiopenReduceDescriptor reduce_desc;
-  ORT_IF_CONSTEXPR (std::is_same<T, MLFloat16>::value || std::is_same<T, BFloat16>::value) {
+  if constexpr (std::is_same<T, MLFloat16>::value || std::is_same<T, BFloat16>::value) {
    ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, MiopenTensor::GetDataType<float>(), ReduceTensorIndices));
  } else {
    ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, miopen_type_X, ReduceTensorIndices));
--- a/onnxruntime/test/framework/sparse_kernels_test.cc
+++ b/onnxruntime/test/framework/sparse_kernels_test.cc
@ -698,7 +698,7 @@ struct InsertIndices {
    std::vector<int8_t> indices_data;
    insert_indices_data(indices_1D, values_size, shape_size, indices_data, indices_tp);
    indices_tp.set_data_type(utils::ToTensorProtoElementType<T>());
-    ORT_IF_CONSTEXPR(sizeof(T) == sizeof(int8_t)) {
+    if constexpr(sizeof(T) == sizeof(int8_t)) {
      indices_tp.mutable_raw_data()->assign(reinterpret_cast<const char*>(indices_data.data()), indices_data.size());
    }
    else {
--- a/tools/ci_build/github/azure-pipelines/linux-gpu-cuda-11-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/linux-gpu-cuda-11-pipeline.yml
@ -1,5 +1,3 @@
-#This file is for CUDA 10.2 and 11.4 even the filename just says 11
-
 resources:
  repositories:
  - repository: manylinux # The name used to reference this repository in the checkout step
@ -9,63 +7,6 @@ resources:
    ref: a8099af1b3e25f0489717ad9c4f9a2e25a8c5b36

 jobs:
- job: Linux_Build_CUDA10_NV6
-  timeoutInMinutes: 180
-  workspace:
-    clean: all
-  pool: Onnxruntime-Linux-GPU-NV6
-  steps:
-  - checkout: self
-    clean: true
-    submodules: recursive
-
-  - template: templates/get-docker-image-steps.yml
-    parameters:
-      Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda11
-      Context: tools/ci_build/github/linux/docker
-      DockerBuildArgs: "--network=host --build-arg POLICY=manylinux2014 --build-arg PLATFORM=x86_64 --build-arg BASEIMAGE=nvcr.io/nvidia/cuda:10.2-cudnn8-devel-centos7 --build-arg DEVTOOLSET_ROOTPATH=/opt/rh/devtoolset-8/root --build-arg PREPEND_PATH=/opt/rh/devtoolset-8/root/usr/bin: --build-arg LD_LIBRARY_PATH_ARG=/opt/rh/devtoolset-8/root/usr/lib64:/opt/rh/devtoolset-8/root/usr/lib:/opt/rh/devtoolset-8/root/usr/lib64/dyninst:/opt/rh/devtoolset-8/root/usr/lib/dyninst:/usr/local/lib64 --build-arg BUILD_UID=$( id -u )"
-      Repository: onnxruntimegpubuild
-  - task: CmdLine@2
-    inputs:
-      script: |
-        mkdir -p $HOME/.onnx
-        docker run --gpus all -e CC=/opt/rh/devtoolset-8/root/usr/bin/cc -e CXX=/opt/rh/devtoolset-8/root/usr/bin/c++ -e CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" --rm \
-          --volume /data/onnx:/data/onnx:ro \
-          --volume $(Build.SourcesDirectory):/onnxruntime_src \
-          --volume $(Build.BinariesDirectory):/build \
-          --volume /data/models:/build/models:ro \
-          --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \
-          -e ALLOW_RELEASED_ONNX_OPSET_ONLY=0 \
-          -e NIGHTLY_BUILD \
-          -e BUILD_BUILDNUMBER \
-          onnxruntimegpubuild \
-            /opt/python/cp37-cp37m/bin/python3 /onnxruntime_src/tools/ci_build/build.py \
-              --build_dir /build --cmake_generator Ninja \
-              --config Release \
-              --skip_submodule_sync \
-              --build_shared_lib \
-              --parallel \
-              --build_wheel \
-              --enable_onnx_tests --use_cuda --cuda_version=10.2 --cuda_home=/usr/local/cuda-10.2 --cudnn_home=/usr/local/cuda-10.2 \
-              --enable_pybind --build_java --build_nodejs \
-              --cmake_extra_defines CMAKE_CUDA_HOST_COMPILER=/opt/rh/devtoolset-8/root/usr/bin/cc  CMAKE_CUDA_ARCHITECTURES=52
-      workingDirectory: $(Build.SourcesDirectory)
-  - task: PublishTestResults@2
-    displayName: 'Publish unit test results'
-    inputs:
-      testResultsFiles: '**/*.results.xml'
-      searchFolder: '$(Build.BinariesDirectory)'
-      testRunTitle: 'Unit Test Run'
-    condition: succeededOrFailed()
-
-  - template: templates/component-governance-component-detection-steps.yml
-    parameters:
-      condition: 'succeeded'
-
-  - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3
-    displayName: 'Clean Agent Directories'
-    condition: always()
-    
 - job: Linux_Build_CUDA11_NV6
  timeoutInMinutes: 180
  workspace:
--- a/tools/ci_build/github/azure-pipelines/win-gpu-reduce-op-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/win-gpu-reduce-op-ci-pipeline.yml
@ -13,7 +13,7 @@ jobs:
    MsbuildArguments: '-detailedsummary -maxcpucount -consoleloggerparameters:PerformanceSummary'
    OnnxRuntimeBuildDirectory: '$(Build.BinariesDirectory)'
    DOTNET_SKIP_FIRST_TIME_EXPERIENCE: true
-    EnvSetupScript: setup_env_cuda.bat
+    EnvSetupScript: setup_env_cuda_11.bat
    buildArch: x64
    setVcvars: true
  timeoutInMinutes: 120
@ -52,7 +52,7 @@ jobs:
    displayName: 'Build and test'
    inputs:
      scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py'
-      arguments: '--config $(BuildConfig) --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_generator "Visual Studio 16 2019" --build_wheel --use_cuda --cuda_version=10.2 --cuda_home="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2" --cudnn_home="C:\local\cudnn-10.2-windows10-x64-v8.0.3.33\cuda" --cmake_extra_defines CMAKE_SYSTEM_VERSION=10.0.18362.0 "CMAKE_CUDA_ARCHITECTURES=52" --include_ops_by_config="$(Build.SourcesDirectory)\onnxruntime\test\testdata\required_ops.config"'
+      arguments: '--config $(BuildConfig) --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_generator "Visual Studio 16 2019" --build_wheel --use_cuda --cuda_version=11.4 --cuda_home="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.4" --cudnn_home="C:\local\cudnn-11.4-windows-x64-v8.2.2.26\cuda" --cmake_extra_defines CMAKE_SYSTEM_VERSION=10.0.18362.0 "CMAKE_CUDA_ARCHITECTURES=52" --include_ops_by_config="$(Build.SourcesDirectory)\onnxruntime\test\testdata\required_ops.config"'
      workingDirectory: '$(Build.BinariesDirectory)'

  - template: templates/component-governance-component-detection-steps.yml
--- a/tools/ci_build/github/windows/setup_env_cuda.bat
+++ b/tools/ci_build/github/windows/setup_env_cuda.bat
@ -1,2 +0,0 @@
-set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\extras\CUPTI\lib64;%PATH%
-set GRADLE_OPTS=-Dorg.gradle.daemon=false