From ac7538b909990f89f9bfebb325cd53c3d31cf158 Mon Sep 17 00:00:00 2001
From: Changming Sun <chasun@microsoft.com>
Date: Wed, 10 Aug 2022 22:46:41 -0700
Subject: [PATCH] Remove CUDA 10.2 support (#12541)

---
 cmake/CMakeLists.txt                          | 13 ++--
 include/onnxruntime/core/common/common.h      |  6 --
 .../onnxruntime/core/common/parse_string.h    |  2 +-
 .../onnxruntime/core/framework/allocator.h    |  2 +-
 .../core/framework/data_types_internal.h      |  4 +-
 include/onnxruntime/core/framework/float16.h  |  4 +-
 onnxruntime/core/framework/endian_utils.cc    |  2 +-
 .../core/framework/tensorprotoutils.cc        |  4 +-
 .../cpu/ml/tree_ensemble_classifier.cc        |  2 +-
 .../core/providers/cpu/ml/treeregressor.cc    |  2 +-
 onnxruntime/core/providers/cuda/nn/conv.cc    |  2 +-
 .../providers/cuda/reduction/reduction_ops.cc |  4 +-
 .../providers/rocm/reduction/reduction_ops.cc |  4 +-
 .../test/framework/sparse_kernels_test.cc     |  2 +-
 .../linux-gpu-cuda-11-pipeline.yml            | 59 -------------------
 .../win-gpu-reduce-op-ci-pipeline.yml         |  4 +-
 .../github/windows/setup_env_cuda.bat         |  2 -
 17 files changed, 23 insertions(+), 95 deletions(-)
 delete mode 100644 tools/ci_build/github/windows/setup_env_cuda.bat
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 2395641eb5..ac2277eb19 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -1245,6 +1245,9 @@ endif()
 function(onnxruntime_set_compile_flags target_name)
     target_compile_definitions(${target_name} PUBLIC EIGEN_USE_THREADS)
     if (MSVC)
+      foreach(CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORY ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+        target_compile_options(${target_name} PRIVATE "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/external:I${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORY}>")
+      endforeach()
       target_compile_definitions(${target_name} PUBLIC -DPLATFORM_WINDOWS -DNOGDI -DNOMINMAX -D_USE_MATH_DEFINES -D_SILENCE_ALL_CXX17_DEPRECATION_WARNINGS)
       if (onnxruntime_ENABLE_MEMLEAK_CHECKER)
         target_compile_definitions(${target_name} PUBLIC -DONNXRUNTIME_ENABLE_MEMLEAK_CHECK)
@@ -1794,15 +1797,7 @@ if (onnxruntime_USE_CUDA)
   set(CMAKE_CUDA_RUNTIME_LIBRARY Shared)
   enable_language(CUDA)
   message( STATUS "CMAKE_CUDA_COMPILER_VERSION: ${CMAKE_CUDA_COMPILER_VERSION}")
-  if (WIN32)
-    set(CMAKE_CUDA_STANDARD 17)
-    foreach(CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORY ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
-      string(APPEND CMAKE_CXX_FLAGS " /external:I\"${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORY}\"")
-    endforeach()
-  else()
-    #CUDA 10.2 on Linux doesn't support C++17
-    set(CMAKE_CUDA_STANDARD 14)
-  endif()
+  set(CMAKE_CUDA_STANDARD 17)
   file(TO_CMAKE_PATH ${onnxruntime_CUDNN_HOME} onnxruntime_CUDNN_HOME)
 
   if (NOT CMAKE_CUDA_ARCHITECTURES)
diff --git a/include/onnxruntime/core/common/common.h b/include/onnxruntime/core/common/common.h
index cf6d2138ca..c405bcacca 100644
--- a/include/onnxruntime/core/common/common.h
+++ b/include/onnxruntime/core/common/common.h
@@ -278,12 +278,6 @@ inline std::wstring ToWideString(const std::wstring& s) { return s; }
 inline std::string ToWideString(const std::string& s) { return s; }
 #endif
 
-#if ((__cplusplus >= 201703L) || (defined(_MSVC_LANG) && (_MSVC_LANG >= 201703L)))
-#define ORT_IF_CONSTEXPR if constexpr
-#else
-#define ORT_IF_CONSTEXPR if
-#endif
-
 constexpr size_t kMaxStrLen = 2048;
 
 // Returns whether `key` is in `container`.
diff --git a/include/onnxruntime/core/common/parse_string.h b/include/onnxruntime/core/common/parse_string.h
index ba224868e6..edb34724f1 100644
--- a/include/onnxruntime/core/common/parse_string.h
+++ b/include/onnxruntime/core/common/parse_string.h
@@ -16,7 +16,7 @@ namespace onnxruntime {
  */
 template <typename T>
 bool TryParseStringWithClassicLocale(const std::string& str, T& value) {
-  ORT_IF_CONSTEXPR (std::is_integral<T>::value && std::is_unsigned<T>::value) {
+  if constexpr (std::is_integral<T>::value && std::is_unsigned<T>::value) {
     // if T is unsigned integral type, reject negative values which will wrap
     if (!str.empty() && str[0] == '-') {
       return false;
diff --git a/include/onnxruntime/core/framework/allocator.h b/include/onnxruntime/core/framework/allocator.h
index 565cba2b49..d26e331160 100644
--- a/include/onnxruntime/core/framework/allocator.h
+++ b/include/onnxruntime/core/framework/allocator.h
@@ -146,7 +146,7 @@ class IAllocator {
     size_t alloc_size = count_or_bytes;
 
     // if T is not void, 'count_or_bytes' == number of items so allow for that
-    ORT_IF_CONSTEXPR(!std::is_void<T>::value) {
+    if constexpr(!std::is_void<T>::value) {
       // sizeof(void) isn't valid, but the compiler isn't smart enough to ignore that this line isn't
       // reachable if T is void. use std::conditional to 'use' void* in the sizeof call
       if (!CalcMemSizeForArray(
diff --git a/include/onnxruntime/core/framework/data_types_internal.h b/include/onnxruntime/core/framework/data_types_internal.h
index 95ee9e6175..9d791a7f64 100644
--- a/include/onnxruntime/core/framework/data_types_internal.h
+++ b/include/onnxruntime/core/framework/data_types_internal.h
@@ -498,7 +498,7 @@ class ContainerChecker {
         ORT_ENFORCE(++index < c.size(), "Sequence is missing type entry for its element");
         constexpr int32_t prim_type = ToTensorProtoElementType<T>();
         // Check if this is a primitive type and it matches
-        ORT_IF_CONSTEXPR(prim_type != ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED) {
+        if constexpr(prim_type != ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED) {
           return c[index].IsType(data_types_internal::ContainerType::kTensor) &&
                  c[index].IsPrimType(prim_type);
         }
@@ -528,7 +528,7 @@ class ContainerChecker {
       }
       ORT_ENFORCE(++index < c.size(), "Map is missing type entry for its value");
       constexpr int32_t val_type = ToTensorProtoElementType<V>();
-      ORT_IF_CONSTEXPR(val_type != ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED) {
+      if constexpr(val_type != ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED) {
         return c[index].IsType(data_types_internal::ContainerType::kTensor) &&
                c[index].IsPrimType(val_type);
       }
diff --git a/include/onnxruntime/core/framework/float16.h b/include/onnxruntime/core/framework/float16.h
index ac30ae8238..04a5a9c97b 100644
--- a/include/onnxruntime/core/framework/float16.h
+++ b/include/onnxruntime/core/framework/float16.h
@@ -69,7 +69,7 @@ struct BFloat16 {
       val = static_cast<uint16_t>((U32 + rounding_bias) >> 16);
     }
 #else
-    ORT_IF_CONSTEXPR(endian::native == endian::little) {
+    if constexpr(endian::native == endian::little) {
       std::memcpy(&val, reinterpret_cast<char*>(&v) + sizeof(uint16_t), sizeof(uint16_t));
     }
     else {
@@ -93,7 +93,7 @@ struct BFloat16 {
     float result;
     char* const first = reinterpret_cast<char*>(&result);
     char* const second = first + sizeof(uint16_t);
-    ORT_IF_CONSTEXPR(endian::native == endian::little) {
+    if constexpr(endian::native == endian::little) {
       std::memset(first, 0, sizeof(uint16_t));
       std::memcpy(second, &val, sizeof(uint16_t));
     }
diff --git a/onnxruntime/core/framework/endian_utils.cc b/onnxruntime/core/framework/endian_utils.cc
index b4afabeeb9..8b61aad769 100644
--- a/onnxruntime/core/framework/endian_utils.cc
+++ b/onnxruntime/core/framework/endian_utils.cc
@@ -56,7 +56,7 @@ Status CopyLittleEndian(size_t element_size_in_bytes,
   ORT_RETURN_IF(source_bytes.size_bytes() != destination_bytes.size_bytes(),
                 "source and destination buffer size mismatch");
 
-  ORT_IF_CONSTEXPR (endian::native == endian::little) {
+  if constexpr (endian::native == endian::little) {
     std::memcpy(destination_bytes.data(), source_bytes.data(), source_bytes.size_bytes());
   } else {
     SwapByteOrderCopy(element_size_in_bytes, source_bytes, destination_bytes);
diff --git a/onnxruntime/core/framework/tensorprotoutils.cc b/onnxruntime/core/framework/tensorprotoutils.cc
index 4acf6cf05a..80a0d17a60 100644
--- a/onnxruntime/core/framework/tensorprotoutils.cc
+++ b/onnxruntime/core/framework/tensorprotoutils.cc
@@ -779,7 +779,7 @@ ONNXTensorElementDataType GetTensorElementType(const ONNX_NAMESPACE::TensorProto
 
 ONNX_NAMESPACE::TensorProto TensorToTensorProto(const Tensor& tensor, const std::string& tensor_proto_name) {
   // Given we are using the raw_data field in the protobuf, this will work only for little-endian format.
-  ORT_IF_CONSTEXPR(endian::native != endian::little) {
+  if constexpr(endian::native != endian::little) {
     ORT_THROW("Big endian not supported");
   }
 
@@ -1127,7 +1127,7 @@ static void SetIndices(gsl::span<int64_t> gathered_indices,
   auto* ind_dest = reinterpret_cast<T*>(raw_indices.data());
   size_t dest_index = 0;
   for (auto src_index : gathered_indices) {
-    ORT_IF_CONSTEXPR(sizeof(T) == sizeof(int8_t)) {
+    if constexpr(sizeof(T) == sizeof(int8_t)) {
       ind_dest[dest_index] = static_cast<T>(src_index);
     }
     else {
diff --git a/onnxruntime/core/providers/cpu/ml/tree_ensemble_classifier.cc b/onnxruntime/core/providers/cpu/ml/tree_ensemble_classifier.cc
index 9a82e9ca76..addbd81244 100644
--- a/onnxruntime/core/providers/cpu/ml/tree_ensemble_classifier.cc
+++ b/onnxruntime/core/providers/cpu/ml/tree_ensemble_classifier.cc
@@ -32,7 +32,7 @@ ADD_IN_TYPE_TREE_ENSEMBLE_CLASSIFIER_OP(int32_t);
 
 template <typename T>
 TreeEnsembleClassifier<T>::TreeEnsembleClassifier(const OpKernelInfo& info) : OpKernel(info) {
-  ORT_IF_CONSTEXPR(std::is_same<T, double>::value) {
+  if constexpr(std::is_same<T, double>::value) {
     p_tree_ensemble_ = std::make_unique<detail::TreeEnsembleCommonClassifier<T, double, OutputType>>();
   }
   else {
diff --git a/onnxruntime/core/providers/cpu/ml/treeregressor.cc b/onnxruntime/core/providers/cpu/ml/treeregressor.cc
index d744c6ce70..7ea0d70ef8 100644
--- a/onnxruntime/core/providers/cpu/ml/treeregressor.cc
+++ b/onnxruntime/core/providers/cpu/ml/treeregressor.cc
@@ -38,7 +38,7 @@ ONNX_CPU_OPERATOR_TYPED_ML_KERNEL(
 
 template <typename T>
 TreeEnsembleRegressor<T>::TreeEnsembleRegressor(const OpKernelInfo& info) : OpKernel(info) {
-  ORT_IF_CONSTEXPR(std::is_same<T, double>::value) {
+  if constexpr(std::is_same<T, double>::value) {
     p_tree_ensemble_ = std::make_unique<detail::TreeEnsembleCommon<T, double, OutputType>>();
   }
   else {
diff --git a/onnxruntime/core/providers/cuda/nn/conv.cc b/onnxruntime/core/providers/cuda/nn/conv.cc
index 8b0ddc0311..fd0d15640f 100644
--- a/onnxruntime/core/providers/cuda/nn/conv.cc
+++ b/onnxruntime/core/providers/cuda/nn/conv.cc
@@ -257,7 +257,7 @@ Status Conv<T>::UpdateState(OpKernelContext* context, bool bias_expected) const
 
     if (!s_.cached_benchmark_results.contains(x_dims_cudnn)) {
       // set math type to tensor core before algorithm search
-      ORT_IF_CONSTEXPR(std::is_same<T, MLFloat16>::value)
+      if constexpr(std::is_same<T, MLFloat16>::value)
         CUDNN_RETURN_IF_ERROR(cudnnSetConvolutionMathType(s_.conv_desc, CUDNN_TENSOR_OP_MATH));
 
       cudnnConvolutionFwdAlgoPerf_t perf;
diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc
index 3ef51d4bf8..f318ee913f 100644
--- a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc
+++ b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc
@@ -205,7 +205,7 @@ Status ReduceKernel<allow_multi_axes>::ReduceKernelShared(
   }
 
   CudnnReduceDescriptor reduce_desc;
-  ORT_IF_CONSTEXPR (std::is_same<T, MLFloat16>::value)
+  if constexpr (std::is_same<T, MLFloat16>::value)
     ORT_RETURN_IF_ERROR(reduce_desc.Set(cudnn_reduce_op, CudnnTensor::GetDataType<float>(), ReduceTensorIndices));
   else
     ORT_RETURN_IF_ERROR(reduce_desc.Set(cudnn_reduce_op, cudnn_type_X, ReduceTensorIndices));
@@ -524,7 +524,7 @@ Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, Pr
   }
 
   CudnnReduceDescriptor reduce_desc;
-  ORT_IF_CONSTEXPR (std::is_same<T, MLFloat16>::value || std::is_same<T, BFloat16>::value) {
+  if constexpr (std::is_same<T, MLFloat16>::value || std::is_same<T, BFloat16>::value) {
     ORT_RETURN_IF_ERROR(reduce_desc.Set(cudnn_reduce_op, CudnnTensor::GetDataType<float>(), ReduceTensorIndices));
   } else {
     ORT_RETURN_IF_ERROR(reduce_desc.Set(cudnn_reduce_op, cudnn_type_X, ReduceTensorIndices));
diff --git a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc
index 7f796e858a..9112156871 100644
--- a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc
+++ b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc
@@ -202,7 +202,7 @@ Status ReduceKernel<allow_multi_axes>::ReduceKernelShared(
   }
 
   MiopenReduceDescriptor reduce_desc;
-  ORT_IF_CONSTEXPR(std::is_same<T, MLFloat16>::value)
+  if constexpr(std::is_same<T, MLFloat16>::value)
   ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, MiopenTensor::GetDataType<float>(), ReduceTensorIndices));
   else ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, miopen_type_X, ReduceTensorIndices));
 
@@ -523,7 +523,7 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr
   }
 
   MiopenReduceDescriptor reduce_desc;
-  ORT_IF_CONSTEXPR (std::is_same<T, MLFloat16>::value || std::is_same<T, BFloat16>::value) {
+  if constexpr (std::is_same<T, MLFloat16>::value || std::is_same<T, BFloat16>::value) {
     ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, MiopenTensor::GetDataType<float>(), ReduceTensorIndices));
   } else {
     ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, miopen_type_X, ReduceTensorIndices));
diff --git a/onnxruntime/test/framework/sparse_kernels_test.cc b/onnxruntime/test/framework/sparse_kernels_test.cc
index 54cb51cff2..a049af3edc 100644
--- a/onnxruntime/test/framework/sparse_kernels_test.cc
+++ b/onnxruntime/test/framework/sparse_kernels_test.cc
@@ -698,7 +698,7 @@ struct InsertIndices {
     std::vector<int8_t> indices_data;
     insert_indices_data(indices_1D, values_size, shape_size, indices_data, indices_tp);
     indices_tp.set_data_type(utils::ToTensorProtoElementType<T>());
-    ORT_IF_CONSTEXPR(sizeof(T) == sizeof(int8_t)) {
+    if constexpr(sizeof(T) == sizeof(int8_t)) {
       indices_tp.mutable_raw_data()->assign(reinterpret_cast<const char*>(indices_data.data()), indices_data.size());
     }
     else {
diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-cuda-11-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-cuda-11-pipeline.yml
index 809a72087e..90a40f1b66 100644
--- a/tools/ci_build/github/azure-pipelines/linux-gpu-cuda-11-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/linux-gpu-cuda-11-pipeline.yml
@@ -1,5 +1,3 @@
-#This file is for CUDA 10.2 and 11.4 even the filename just says 11
-
 resources:
   repositories:
   - repository: manylinux # The name used to reference this repository in the checkout step
@@ -9,63 +7,6 @@ resources:
     ref: a8099af1b3e25f0489717ad9c4f9a2e25a8c5b36
 
 jobs:
-- job: Linux_Build_CUDA10_NV6
-  timeoutInMinutes: 180
-  workspace:
-    clean: all
-  pool: Onnxruntime-Linux-GPU-NV6
-  steps:
-  - checkout: self
-    clean: true
-    submodules: recursive
-
-  - template: templates/get-docker-image-steps.yml
-    parameters:
-      Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda11
-      Context: tools/ci_build/github/linux/docker
-      DockerBuildArgs: "--network=host --build-arg POLICY=manylinux2014 --build-arg PLATFORM=x86_64 --build-arg BASEIMAGE=nvcr.io/nvidia/cuda:10.2-cudnn8-devel-centos7 --build-arg DEVTOOLSET_ROOTPATH=/opt/rh/devtoolset-8/root --build-arg PREPEND_PATH=/opt/rh/devtoolset-8/root/usr/bin: --build-arg LD_LIBRARY_PATH_ARG=/opt/rh/devtoolset-8/root/usr/lib64:/opt/rh/devtoolset-8/root/usr/lib:/opt/rh/devtoolset-8/root/usr/lib64/dyninst:/opt/rh/devtoolset-8/root/usr/lib/dyninst:/usr/local/lib64 --build-arg BUILD_UID=$( id -u )"
-      Repository: onnxruntimegpubuild
-  - task: CmdLine@2
-    inputs:
-      script: |
-        mkdir -p $HOME/.onnx
-        docker run --gpus all -e CC=/opt/rh/devtoolset-8/root/usr/bin/cc -e CXX=/opt/rh/devtoolset-8/root/usr/bin/c++ -e CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" --rm \
-          --volume /data/onnx:/data/onnx:ro \
-          --volume $(Build.SourcesDirectory):/onnxruntime_src \
-          --volume $(Build.BinariesDirectory):/build \
-          --volume /data/models:/build/models:ro \
-          --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \
-          -e ALLOW_RELEASED_ONNX_OPSET_ONLY=0 \
-          -e NIGHTLY_BUILD \
-          -e BUILD_BUILDNUMBER \
-          onnxruntimegpubuild \
-            /opt/python/cp37-cp37m/bin/python3 /onnxruntime_src/tools/ci_build/build.py \
-              --build_dir /build --cmake_generator Ninja \
-              --config Release \
-              --skip_submodule_sync \
-              --build_shared_lib \
-              --parallel \
-              --build_wheel \
-              --enable_onnx_tests --use_cuda --cuda_version=10.2 --cuda_home=/usr/local/cuda-10.2 --cudnn_home=/usr/local/cuda-10.2 \
-              --enable_pybind --build_java --build_nodejs \
-              --cmake_extra_defines CMAKE_CUDA_HOST_COMPILER=/opt/rh/devtoolset-8/root/usr/bin/cc  CMAKE_CUDA_ARCHITECTURES=52
-      workingDirectory: $(Build.SourcesDirectory)
-  - task: PublishTestResults@2
-    displayName: 'Publish unit test results'
-    inputs:
-      testResultsFiles: '**/*.results.xml'
-      searchFolder: '$(Build.BinariesDirectory)'
-      testRunTitle: 'Unit Test Run'
-    condition: succeededOrFailed()
-
-  - template: templates/component-governance-component-detection-steps.yml
-    parameters:
-      condition: 'succeeded'
-
-  - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3
-    displayName: 'Clean Agent Directories'
-    condition: always()
-    
 - job: Linux_Build_CUDA11_NV6
   timeoutInMinutes: 180
   workspace:
diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-reduce-op-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-reduce-op-ci-pipeline.yml
index 7dda81a09c..bdb28c9cba 100644
--- a/tools/ci_build/github/azure-pipelines/win-gpu-reduce-op-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/win-gpu-reduce-op-ci-pipeline.yml
@@ -13,7 +13,7 @@ jobs:
     MsbuildArguments: '-detailedsummary -maxcpucount -consoleloggerparameters:PerformanceSummary'
     OnnxRuntimeBuildDirectory: '$(Build.BinariesDirectory)'
     DOTNET_SKIP_FIRST_TIME_EXPERIENCE: true
-    EnvSetupScript: setup_env_cuda.bat
+    EnvSetupScript: setup_env_cuda_11.bat
     buildArch: x64
     setVcvars: true
   timeoutInMinutes: 120
@@ -52,7 +52,7 @@ jobs:
     displayName: 'Build and test'
     inputs:
       scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py'
-      arguments: '--config $(BuildConfig) --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_generator "Visual Studio 16 2019" --build_wheel --use_cuda --cuda_version=10.2 --cuda_home="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2" --cudnn_home="C:\local\cudnn-10.2-windows10-x64-v8.0.3.33\cuda" --cmake_extra_defines CMAKE_SYSTEM_VERSION=10.0.18362.0 "CMAKE_CUDA_ARCHITECTURES=52" --include_ops_by_config="$(Build.SourcesDirectory)\onnxruntime\test\testdata\required_ops.config"'
+      arguments: '--config $(BuildConfig) --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_generator "Visual Studio 16 2019" --build_wheel --use_cuda --cuda_version=11.4 --cuda_home="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.4" --cudnn_home="C:\local\cudnn-11.4-windows-x64-v8.2.2.26\cuda" --cmake_extra_defines CMAKE_SYSTEM_VERSION=10.0.18362.0 "CMAKE_CUDA_ARCHITECTURES=52" --include_ops_by_config="$(Build.SourcesDirectory)\onnxruntime\test\testdata\required_ops.config"'
       workingDirectory: '$(Build.BinariesDirectory)'
 
   - template: templates/component-governance-component-detection-steps.yml
diff --git a/tools/ci_build/github/windows/setup_env_cuda.bat b/tools/ci_build/github/windows/setup_env_cuda.bat
deleted file mode 100644
index 5af5e502f2..0000000000
--- a/tools/ci_build/github/windows/setup_env_cuda.bat
+++ /dev/null
@@ -1,2 +0,0 @@
-set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\extras\CUPTI\lib64;%PATH%
-set GRADLE_OPTS=-Dorg.gradle.daemon=false