From 5c3c7643db6c160a4543dd526335acf8f79a6209 Mon Sep 17 00:00:00 2001 From: Yifan Li <109183385+yf711@users.noreply.github.com> Date: Tue, 14 Jan 2025 14:27:34 -0800 Subject: [PATCH] Update range of gpu arch (#23309) ### Description * Remove deprecated gpu arch to control nuget/python package size (latest TRT supports sm75 Turing and newer arch) * Add 90 to support blackwell series in next release (86;89 not considered as adding them will rapidly increase package size) | arch_range | Python-cuda12 | Nuget-cuda12 | | -------------- | ------------------------------------------------------------ | ---------------------------------- | | 60;61;70;75;80 | Linux: 279MB Win: 267MB | Linux: 247MB Win: 235MB | | 75;80 | Linux: 174MB Win: 162MB | Linux: 168MB Win: 156MB | | **75;80;90** | **Linux: 299MB Win: 277MB** | **Linux: 294MB Win: 271MB** | | 75;80;86;89 | [Linux: MB Win: 390MB](https://aiinfra.visualstudio.com/Lotus/_build/results?buildId=647457&view=results) | Linux: 416MB Win: 383MB | | 75;80;86;89;90 | [Linux: MB Win: 505MB](https://aiinfra.visualstudio.com/Lotus/_build/results?buildId=646536&view=results) | Linux: 541MB Win: 498MB | ### Motivation and Context Callout: While adding sm90 support, the build of cuda11.8+cudnn8 will be dropped in the coming ORT release, as the build has issue with blackwell (mentioned in comments) and demand on cuda 11 is minor, according to internal ort-cuda11 repo. --- dockerfiles/Dockerfile.cuda | 4 +++- dockerfiles/Dockerfile.tensorrt | 4 +++- .../stages/nuget-win-cuda-packaging-stage.yml | 4 ++-- .../azure-pipelines/stages/py-gpu-packaging-stage.yml | 2 +- .../templates/py-packaging-selectable-stage.yml | 6 +++--- tools/ci_build/github/linux/build_cuda_c_api_package.sh | 2 +- tools/ci_build/github/linux/build_linux_python_package.sh | 2 +- tools/ci_build/github/linux/build_tensorrt_c_api_package.sh | 2 +- 8 files changed, 15 insertions(+), 11 deletions(-) diff --git a/dockerfiles/Dockerfile.cuda b/dockerfiles/Dockerfile.cuda index 40f11dca62..db1a92a27a 100644 --- a/dockerfiles/Dockerfile.cuda +++ b/dockerfiles/Dockerfile.cuda @@ -12,7 +12,9 @@ ARG OS=ubuntu24.04 FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-devel-${OS} ARG CUDA_VERSION ARG CUDNN_VERSION -ARG CMAKE_CUDA_ARCHITECTURES="61;70;75;80;86;90" +# Adjust as needed +# Check your CUDA arch: https://developer.nvidia.com/cuda-gpus +ARG CMAKE_CUDA_ARCHITECTURES="75;80;90" ENV DEBIAN_FRONTEND=noninteractive diff --git a/dockerfiles/Dockerfile.tensorrt b/dockerfiles/Dockerfile.tensorrt index 24947df630..c1fdf73e0f 100644 --- a/dockerfiles/Dockerfile.tensorrt +++ b/dockerfiles/Dockerfile.tensorrt @@ -10,7 +10,9 @@ FROM nvcr.io/nvidia/tensorrt:${TRT_CONTAINER_VERSION}-py3 ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime ARG ONNXRUNTIME_BRANCH=main -ARG CMAKE_CUDA_ARCHITECTURES=37;50;52;60;61;70;75;80 +# Adjust as needed +# Check your CUDA arch: https://developer.nvidia.com/cuda-gpus +ARG CMAKE_CUDA_ARCHITECTURES=75;80;90 RUN apt-get update &&\ apt-get install -y sudo git bash unattended-upgrades diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml index 75da33722e..095ac9e708 100644 --- a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml @@ -50,7 +50,7 @@ stages: msbuildPlatform: x64 packageName: x64-cuda CudaVersion: ${{ parameters.CudaVersion }} - buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" + buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=75;80;90" runTests: ${{ parameters.RunOnnxRuntimeTests }} buildJava: ${{ parameters.buildJava }} java_artifact_id: onnxruntime_gpu @@ -68,7 +68,7 @@ stages: msbuildPlatform: x64 CudaVersion: ${{ parameters.CudaVersion }} packageName: x64-tensorrt - buildparameter: --use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" + buildparameter: --use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=75;80;90" runTests: ${{ parameters.RunOnnxRuntimeTests }} buildJava: ${{ parameters.buildJava }} java_artifact_id: onnxruntime_gpu diff --git a/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml index 45c0eea66b..a3c804055d 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml @@ -56,7 +56,7 @@ stages: PYTHON_VERSION: ${{ python_version }} EP_NAME: gpu CudaVersion: ${{ parameters.cuda_version }} - EP_BUILD_FLAGS: --enable_lto --cuda_home=$(Agent.TempDirectory)\v${{ parameters.cuda_version }} --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" + EP_BUILD_FLAGS: --enable_lto --cuda_home=$(Agent.TempDirectory)\v${{ parameters.cuda_version }} --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=75;80;90" use_tensorrt: True - ${{ if eq(parameters.enable_linux_cuda, true) }}: diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-selectable-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-selectable-stage.yml index 2485bd0e8e..b4908fd1ae 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-selectable-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-selectable-stage.yml @@ -319,7 +319,7 @@ stages: --build_wheel \ --enable_onnx_tests --use_tensorrt --cuda_version=11.8 --tensorrt_home=/usr --cuda_home=/usr/local/cuda-11.8 --cudnn_home=/usr/local/cuda-11.8 \ ${{ parameters.gpu_build_py_parameters }} \ - --cmake_extra_defines CMAKE_CUDA_HOST_COMPILER=/opt/rh/devtoolset-11/root/usr/bin/cc 'CMAKE_CUDA_ARCHITECTURES=37;50;52;60;61;70;75;80' + --cmake_extra_defines CMAKE_CUDA_HOST_COMPILER=/opt/rh/devtoolset-11/root/usr/bin/cc 'CMAKE_CUDA_ARCHITECTURES=75;80;90' workingDirectory: $(Build.SourcesDirectory) - task: CmdLine@2 @@ -349,7 +349,7 @@ stages: --build_wheel \ --enable_onnx_tests --use_tensorrt --cuda_version=11.8 --tensorrt_home=/usr --cuda_home=/usr/local/cuda-11.8 --cudnn_home=/usr/local/cuda-11.8 \ ${{ parameters.gpu_build_py_parameters }} --ctest_path '' \ - --cmake_extra_defines CMAKE_CUDA_HOST_COMPILER=/opt/rh/devtoolset-11/root/usr/bin/cc 'CMAKE_CUDA_ARCHITECTURES=37;50;52;60;61;70;75;80' + --cmake_extra_defines CMAKE_CUDA_HOST_COMPILER=/opt/rh/devtoolset-11/root/usr/bin/cc 'CMAKE_CUDA_ARCHITECTURES=75;80;90' - task: CopyFiles@2 displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)' @@ -379,7 +379,7 @@ stages: - template: common-variables.yml CUDA_VERSION: '11.8' buildArch: x64 - EpBuildFlags: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\${{ variables.win_trt_folder_cuda11 }}" --cuda_version=$(CUDA_VERSION) --cuda_home="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v$(CUDA_VERSION)" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=37;50;52;60;61;70;75;80" + EpBuildFlags: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\${{ variables.win_trt_folder_cuda11 }}" --cuda_version=$(CUDA_VERSION) --cuda_home="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v$(CUDA_VERSION)" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=75;80;90" EnvSetupScript: setup_env_gpu.bat EP_NAME: gpu VSGenerator: 'Visual Studio 17 2022' diff --git a/tools/ci_build/github/linux/build_cuda_c_api_package.sh b/tools/ci_build/github/linux/build_cuda_c_api_package.sh index 9922fc396b..8a8d02936e 100755 --- a/tools/ci_build/github/linux/build_cuda_c_api_package.sh +++ b/tools/ci_build/github/linux/build_cuda_c_api_package.sh @@ -2,4 +2,4 @@ set -e -x docker run --rm --volume \ $BUILD_SOURCESDIRECTORY:/onnxruntime_src --volume $BUILD_BINARIESDIRECTORY:/build -e NIGHTLY_BUILD onnxruntimecuda${CUDA_VERSION_MAJOR}build \ -/bin/bash -c "/usr/bin/python3.12 /onnxruntime_src/tools/ci_build/build.py --enable_lto --build_java --build_nodejs --build_dir /build --config Release --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --use_cuda --cuda_version=$CUDA_VERSION --cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr/local/cuda-$CUDA_VERSION --skip_tests --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80' && cd /build/Release && make install DESTDIR=/build/installed" +/bin/bash -c "/usr/bin/python3.12 /onnxruntime_src/tools/ci_build/build.py --enable_lto --build_java --build_nodejs --build_dir /build --config Release --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --use_cuda --cuda_version=$CUDA_VERSION --cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr/local/cuda-$CUDA_VERSION --skip_tests --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=75;80;90' && cd /build/Release && make install DESTDIR=/build/installed" diff --git a/tools/ci_build/github/linux/build_linux_python_package.sh b/tools/ci_build/github/linux/build_linux_python_package.sh index 0dd435f8db..6d86a57bd7 100755 --- a/tools/ci_build/github/linux/build_linux_python_package.sh +++ b/tools/ci_build/github/linux/build_linux_python_package.sh @@ -70,7 +70,7 @@ fi if [ "$BUILD_DEVICE" == "GPU" ]; then SHORT_CUDA_VERSION=$(echo $CUDA_VERSION | sed 's/\([[:digit:]]\+\.[[:digit:]]\+\)\.[[:digit:]]\+/\1/') #Enable CUDA and TRT EPs. - BUILD_ARGS+=("--use_cuda" "--use_tensorrt" "--cuda_version=$SHORT_CUDA_VERSION" "--tensorrt_home=/usr" "--cuda_home=/usr/local/cuda-$SHORT_CUDA_VERSION" "--cudnn_home=/usr/local/cuda-$SHORT_CUDA_VERSION" "--cmake_extra_defines" "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80") + BUILD_ARGS+=("--use_cuda" "--use_tensorrt" "--cuda_version=$SHORT_CUDA_VERSION" "--tensorrt_home=/usr" "--cuda_home=/usr/local/cuda-$SHORT_CUDA_VERSION" "--cudnn_home=/usr/local/cuda-$SHORT_CUDA_VERSION" "--cmake_extra_defines" "CMAKE_CUDA_ARCHITECTURES=75;80;90") fi if [ "$BUILD_DEVICE" == "NPU" ]; then diff --git a/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh b/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh index 7f18e2f849..ce6f0929c0 100755 --- a/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh +++ b/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh @@ -3,4 +3,4 @@ set -e -x mkdir -p $HOME/.onnx docker run --rm --volume /data/onnx:/data/onnx:ro --volume $BUILD_SOURCESDIRECTORY:/onnxruntime_src --volume $BUILD_BINARIESDIRECTORY:/build \ --volume /data/models:/build/models:ro --volume $HOME/.onnx:/home/onnxruntimedev/.onnx -e NIGHTLY_BUILD onnxruntimecuda${CUDA_VERSION_MAJOR}xtrt86build \ -/bin/bash -c "/usr/bin/python3.12 /onnxruntime_src/tools/ci_build/build.py --build_dir /build --config Release --skip_tests --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --build_java --build_nodejs --use_tensorrt --cuda_version=$CUDA_VERSION --cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr --tensorrt_home=/usr --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80' && cd /build/Release && make install DESTDIR=/build/installed" +/bin/bash -c "/usr/bin/python3.12 /onnxruntime_src/tools/ci_build/build.py --build_dir /build --config Release --skip_tests --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --build_java --build_nodejs --use_tensorrt --cuda_version=$CUDA_VERSION --cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr --tensorrt_home=/usr --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=75;80;90' && cd /build/Release && make install DESTDIR=/build/installed"