Remove CUDA 10.2 support (#12541)

This commit is contained in:
Changming Sun 2022-08-10 22:46:41 -07:00 committed by GitHub
parent 819c36701f
commit ac7538b909
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
17 changed files with 23 additions and 95 deletions

View file

@ -1245,6 +1245,9 @@ endif()
function(onnxruntime_set_compile_flags target_name)
target_compile_definitions(${target_name} PUBLIC EIGEN_USE_THREADS)
if (MSVC)
foreach(CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORY ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
target_compile_options(${target_name} PRIVATE "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/external:I${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORY}>")
endforeach()
target_compile_definitions(${target_name} PUBLIC -DPLATFORM_WINDOWS -DNOGDI -DNOMINMAX -D_USE_MATH_DEFINES -D_SILENCE_ALL_CXX17_DEPRECATION_WARNINGS)
if (onnxruntime_ENABLE_MEMLEAK_CHECKER)
target_compile_definitions(${target_name} PUBLIC -DONNXRUNTIME_ENABLE_MEMLEAK_CHECK)
@ -1794,15 +1797,7 @@ if (onnxruntime_USE_CUDA)
set(CMAKE_CUDA_RUNTIME_LIBRARY Shared)
enable_language(CUDA)
message( STATUS "CMAKE_CUDA_COMPILER_VERSION: ${CMAKE_CUDA_COMPILER_VERSION}")
if (WIN32)
set(CMAKE_CUDA_STANDARD 17)
foreach(CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORY ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
string(APPEND CMAKE_CXX_FLAGS " /external:I\"${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORY}\"")
endforeach()
else()
#CUDA 10.2 on Linux doesn't support C++17
set(CMAKE_CUDA_STANDARD 14)
endif()
set(CMAKE_CUDA_STANDARD 17)
file(TO_CMAKE_PATH ${onnxruntime_CUDNN_HOME} onnxruntime_CUDNN_HOME)
if (NOT CMAKE_CUDA_ARCHITECTURES)

View file

@ -278,12 +278,6 @@ inline std::wstring ToWideString(const std::wstring& s) { return s; }
inline std::string ToWideString(const std::string& s) { return s; }
#endif
#if ((__cplusplus >= 201703L) || (defined(_MSVC_LANG) && (_MSVC_LANG >= 201703L)))
#define ORT_IF_CONSTEXPR if constexpr
#else
#define ORT_IF_CONSTEXPR if
#endif
constexpr size_t kMaxStrLen = 2048;
// Returns whether `key` is in `container`.

View file

@ -16,7 +16,7 @@ namespace onnxruntime {
*/
template <typename T>
bool TryParseStringWithClassicLocale(const std::string& str, T& value) {
ORT_IF_CONSTEXPR (std::is_integral<T>::value && std::is_unsigned<T>::value) {
if constexpr (std::is_integral<T>::value && std::is_unsigned<T>::value) {
// if T is an unsigned integral type, reject negative values, which would otherwise wrap
if (!str.empty() && str[0] == '-') {
return false;

View file

@ -146,7 +146,7 @@ class IAllocator {
size_t alloc_size = count_or_bytes;
// if T is not void, 'count_or_bytes' == number of items so allow for that
ORT_IF_CONSTEXPR(!std::is_void<T>::value) {
if constexpr(!std::is_void<T>::value) {
// sizeof(void) isn't valid, but the compiler isn't smart enough to ignore that this line isn't
// reachable if T is void. Use std::conditional to 'use' void* in the sizeof call.
if (!CalcMemSizeForArray(

View file

@ -498,7 +498,7 @@ class ContainerChecker {
ORT_ENFORCE(++index < c.size(), "Sequence is missing type entry for its element");
constexpr int32_t prim_type = ToTensorProtoElementType<T>();
// Check if this is a primitive type and it matches
ORT_IF_CONSTEXPR(prim_type != ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED) {
if constexpr(prim_type != ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED) {
return c[index].IsType(data_types_internal::ContainerType::kTensor) &&
c[index].IsPrimType(prim_type);
}
@ -528,7 +528,7 @@ class ContainerChecker {
}
ORT_ENFORCE(++index < c.size(), "Map is missing type entry for its value");
constexpr int32_t val_type = ToTensorProtoElementType<V>();
ORT_IF_CONSTEXPR(val_type != ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED) {
if constexpr(val_type != ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED) {
return c[index].IsType(data_types_internal::ContainerType::kTensor) &&
c[index].IsPrimType(val_type);
}

View file

@ -69,7 +69,7 @@ struct BFloat16 {
val = static_cast<uint16_t>((U32 + rounding_bias) >> 16);
}
#else
ORT_IF_CONSTEXPR(endian::native == endian::little) {
if constexpr(endian::native == endian::little) {
std::memcpy(&val, reinterpret_cast<char*>(&v) + sizeof(uint16_t), sizeof(uint16_t));
}
else {
@ -93,7 +93,7 @@ struct BFloat16 {
float result;
char* const first = reinterpret_cast<char*>(&result);
char* const second = first + sizeof(uint16_t);
ORT_IF_CONSTEXPR(endian::native == endian::little) {
if constexpr(endian::native == endian::little) {
std::memset(first, 0, sizeof(uint16_t));
std::memcpy(second, &val, sizeof(uint16_t));
}

View file

@ -56,7 +56,7 @@ Status CopyLittleEndian(size_t element_size_in_bytes,
ORT_RETURN_IF(source_bytes.size_bytes() != destination_bytes.size_bytes(),
"source and destination buffer size mismatch");
ORT_IF_CONSTEXPR (endian::native == endian::little) {
if constexpr (endian::native == endian::little) {
std::memcpy(destination_bytes.data(), source_bytes.data(), source_bytes.size_bytes());
} else {
SwapByteOrderCopy(element_size_in_bytes, source_bytes, destination_bytes);

View file

@ -779,7 +779,7 @@ ONNXTensorElementDataType GetTensorElementType(const ONNX_NAMESPACE::TensorProto
ONNX_NAMESPACE::TensorProto TensorToTensorProto(const Tensor& tensor, const std::string& tensor_proto_name) {
// Given we are using the raw_data field in the protobuf, this will work only for little-endian format.
ORT_IF_CONSTEXPR(endian::native != endian::little) {
if constexpr(endian::native != endian::little) {
ORT_THROW("Big endian not supported");
}
@ -1127,7 +1127,7 @@ static void SetIndices(gsl::span<int64_t> gathered_indices,
auto* ind_dest = reinterpret_cast<T*>(raw_indices.data());
size_t dest_index = 0;
for (auto src_index : gathered_indices) {
ORT_IF_CONSTEXPR(sizeof(T) == sizeof(int8_t)) {
if constexpr(sizeof(T) == sizeof(int8_t)) {
ind_dest[dest_index] = static_cast<T>(src_index);
}
else {

View file

@ -32,7 +32,7 @@ ADD_IN_TYPE_TREE_ENSEMBLE_CLASSIFIER_OP(int32_t);
template <typename T>
TreeEnsembleClassifier<T>::TreeEnsembleClassifier(const OpKernelInfo& info) : OpKernel(info) {
ORT_IF_CONSTEXPR(std::is_same<T, double>::value) {
if constexpr(std::is_same<T, double>::value) {
p_tree_ensemble_ = std::make_unique<detail::TreeEnsembleCommonClassifier<T, double, OutputType>>();
}
else {

View file

@ -38,7 +38,7 @@ ONNX_CPU_OPERATOR_TYPED_ML_KERNEL(
template <typename T>
TreeEnsembleRegressor<T>::TreeEnsembleRegressor(const OpKernelInfo& info) : OpKernel(info) {
ORT_IF_CONSTEXPR(std::is_same<T, double>::value) {
if constexpr(std::is_same<T, double>::value) {
p_tree_ensemble_ = std::make_unique<detail::TreeEnsembleCommon<T, double, OutputType>>();
}
else {

View file

@ -257,7 +257,7 @@ Status Conv<T>::UpdateState(OpKernelContext* context, bool bias_expected) const
if (!s_.cached_benchmark_results.contains(x_dims_cudnn)) {
// set math type to tensor core before algorithm search
ORT_IF_CONSTEXPR(std::is_same<T, MLFloat16>::value)
if constexpr(std::is_same<T, MLFloat16>::value)
CUDNN_RETURN_IF_ERROR(cudnnSetConvolutionMathType(s_.conv_desc, CUDNN_TENSOR_OP_MATH));
cudnnConvolutionFwdAlgoPerf_t perf;

View file

@ -205,7 +205,7 @@ Status ReduceKernel<allow_multi_axes>::ReduceKernelShared(
}
CudnnReduceDescriptor reduce_desc;
ORT_IF_CONSTEXPR (std::is_same<T, MLFloat16>::value)
if constexpr (std::is_same<T, MLFloat16>::value)
ORT_RETURN_IF_ERROR(reduce_desc.Set(cudnn_reduce_op, CudnnTensor::GetDataType<float>(), ReduceTensorIndices));
else
ORT_RETURN_IF_ERROR(reduce_desc.Set(cudnn_reduce_op, cudnn_type_X, ReduceTensorIndices));
@ -524,7 +524,7 @@ Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, Pr
}
CudnnReduceDescriptor reduce_desc;
ORT_IF_CONSTEXPR (std::is_same<T, MLFloat16>::value || std::is_same<T, BFloat16>::value) {
if constexpr (std::is_same<T, MLFloat16>::value || std::is_same<T, BFloat16>::value) {
ORT_RETURN_IF_ERROR(reduce_desc.Set(cudnn_reduce_op, CudnnTensor::GetDataType<float>(), ReduceTensorIndices));
} else {
ORT_RETURN_IF_ERROR(reduce_desc.Set(cudnn_reduce_op, cudnn_type_X, ReduceTensorIndices));

View file

@ -202,7 +202,7 @@ Status ReduceKernel<allow_multi_axes>::ReduceKernelShared(
}
MiopenReduceDescriptor reduce_desc;
ORT_IF_CONSTEXPR(std::is_same<T, MLFloat16>::value)
if constexpr(std::is_same<T, MLFloat16>::value)
ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, MiopenTensor::GetDataType<float>(), ReduceTensorIndices));
else ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, miopen_type_X, ReduceTensorIndices));
@ -523,7 +523,7 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr
}
MiopenReduceDescriptor reduce_desc;
ORT_IF_CONSTEXPR (std::is_same<T, MLFloat16>::value || std::is_same<T, BFloat16>::value) {
if constexpr (std::is_same<T, MLFloat16>::value || std::is_same<T, BFloat16>::value) {
ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, MiopenTensor::GetDataType<float>(), ReduceTensorIndices));
} else {
ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, miopen_type_X, ReduceTensorIndices));

View file

@ -698,7 +698,7 @@ struct InsertIndices {
std::vector<int8_t> indices_data;
insert_indices_data(indices_1D, values_size, shape_size, indices_data, indices_tp);
indices_tp.set_data_type(utils::ToTensorProtoElementType<T>());
ORT_IF_CONSTEXPR(sizeof(T) == sizeof(int8_t)) {
if constexpr(sizeof(T) == sizeof(int8_t)) {
indices_tp.mutable_raw_data()->assign(reinterpret_cast<const char*>(indices_data.data()), indices_data.size());
}
else {

View file

@ -1,5 +1,3 @@
# This file is for CUDA 10.2 and 11.4, even though the filename only says 11
resources:
repositories:
- repository: manylinux # The name used to reference this repository in the checkout step
@ -9,63 +7,6 @@ resources:
ref: a8099af1b3e25f0489717ad9c4f9a2e25a8c5b36
jobs:
- job: Linux_Build_CUDA10_NV6
timeoutInMinutes: 180
workspace:
clean: all
pool: Onnxruntime-Linux-GPU-NV6
steps:
- checkout: self
clean: true
submodules: recursive
- template: templates/get-docker-image-steps.yml
parameters:
Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda11
Context: tools/ci_build/github/linux/docker
DockerBuildArgs: "--network=host --build-arg POLICY=manylinux2014 --build-arg PLATFORM=x86_64 --build-arg BASEIMAGE=nvcr.io/nvidia/cuda:10.2-cudnn8-devel-centos7 --build-arg DEVTOOLSET_ROOTPATH=/opt/rh/devtoolset-8/root --build-arg PREPEND_PATH=/opt/rh/devtoolset-8/root/usr/bin: --build-arg LD_LIBRARY_PATH_ARG=/opt/rh/devtoolset-8/root/usr/lib64:/opt/rh/devtoolset-8/root/usr/lib:/opt/rh/devtoolset-8/root/usr/lib64/dyninst:/opt/rh/devtoolset-8/root/usr/lib/dyninst:/usr/local/lib64 --build-arg BUILD_UID=$( id -u )"
Repository: onnxruntimegpubuild
- task: CmdLine@2
inputs:
script: |
mkdir -p $HOME/.onnx
docker run --gpus all -e CC=/opt/rh/devtoolset-8/root/usr/bin/cc -e CXX=/opt/rh/devtoolset-8/root/usr/bin/c++ -e CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" --rm \
--volume /data/onnx:/data/onnx:ro \
--volume $(Build.SourcesDirectory):/onnxruntime_src \
--volume $(Build.BinariesDirectory):/build \
--volume /data/models:/build/models:ro \
--volume $HOME/.onnx:/home/onnxruntimedev/.onnx \
-e ALLOW_RELEASED_ONNX_OPSET_ONLY=0 \
-e NIGHTLY_BUILD \
-e BUILD_BUILDNUMBER \
onnxruntimegpubuild \
/opt/python/cp37-cp37m/bin/python3 /onnxruntime_src/tools/ci_build/build.py \
--build_dir /build --cmake_generator Ninja \
--config Release \
--skip_submodule_sync \
--build_shared_lib \
--parallel \
--build_wheel \
--enable_onnx_tests --use_cuda --cuda_version=10.2 --cuda_home=/usr/local/cuda-10.2 --cudnn_home=/usr/local/cuda-10.2 \
--enable_pybind --build_java --build_nodejs \
--cmake_extra_defines CMAKE_CUDA_HOST_COMPILER=/opt/rh/devtoolset-8/root/usr/bin/cc CMAKE_CUDA_ARCHITECTURES=52
workingDirectory: $(Build.SourcesDirectory)
- task: PublishTestResults@2
displayName: 'Publish unit test results'
inputs:
testResultsFiles: '**/*.results.xml'
searchFolder: '$(Build.BinariesDirectory)'
testRunTitle: 'Unit Test Run'
condition: succeededOrFailed()
- template: templates/component-governance-component-detection-steps.yml
parameters:
condition: 'succeeded'
- task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3
displayName: 'Clean Agent Directories'
condition: always()
- job: Linux_Build_CUDA11_NV6
timeoutInMinutes: 180
workspace:

View file

@ -13,7 +13,7 @@ jobs:
MsbuildArguments: '-detailedsummary -maxcpucount -consoleloggerparameters:PerformanceSummary'
OnnxRuntimeBuildDirectory: '$(Build.BinariesDirectory)'
DOTNET_SKIP_FIRST_TIME_EXPERIENCE: true
EnvSetupScript: setup_env_cuda.bat
EnvSetupScript: setup_env_cuda_11.bat
buildArch: x64
setVcvars: true
timeoutInMinutes: 120
@ -52,7 +52,7 @@ jobs:
displayName: 'Build and test'
inputs:
scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py'
arguments: '--config $(BuildConfig) --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_generator "Visual Studio 16 2019" --build_wheel --use_cuda --cuda_version=10.2 --cuda_home="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2" --cudnn_home="C:\local\cudnn-10.2-windows10-x64-v8.0.3.33\cuda" --cmake_extra_defines CMAKE_SYSTEM_VERSION=10.0.18362.0 "CMAKE_CUDA_ARCHITECTURES=52" --include_ops_by_config="$(Build.SourcesDirectory)\onnxruntime\test\testdata\required_ops.config"'
arguments: '--config $(BuildConfig) --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_generator "Visual Studio 16 2019" --build_wheel --use_cuda --cuda_version=11.4 --cuda_home="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.4" --cudnn_home="C:\local\cudnn-11.4-windows-x64-v8.2.2.26\cuda" --cmake_extra_defines CMAKE_SYSTEM_VERSION=10.0.18362.0 "CMAKE_CUDA_ARCHITECTURES=52" --include_ops_by_config="$(Build.SourcesDirectory)\onnxruntime\test\testdata\required_ops.config"'
workingDirectory: '$(Build.BinariesDirectory)'
- template: templates/component-governance-component-detection-steps.yml

View file

@ -1,2 +0,0 @@
set PATH=C:\azcopy;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\extras\CUPTI\lib64;%PATH%
set GRADLE_OPTS=-Dorg.gradle.daemon=false