From 3bb5fb0f90770a128f9055c2bde7f29a79ccbbd8 Mon Sep 17 00:00:00 2001 From: Adam Louly Date: Thu, 25 Aug 2022 22:12:01 -0700 Subject: [PATCH] moving training pipelines from cuda 11.5 to 11.6 and deprecating 11.3 (packaging pipeline) (#12688) * moving training pipelines from cuda 11.5 to 11.6 and deprecating cuda 11.3 * change to cuda 11.6.2 * change pytorch's & torchvision's cuda version to 11.6 * specify deps version to 11.6.2 * update pytorch and torchtext version * torch 1.12.1 * change torchvision and torchtext version to be compatible with torch 1.12.1 * change cuda to 11.6 for cuda_home compatibility Co-authored-by: Adam Louly --- ...training-py-packaging-pipeline-cuda113.yml | 22 --- ...raining-py-packaging-pipeline-cuda116.yml} | 4 +- ...Dockerfile.manylinux2014_training_cuda11_3 | 187 ------------------ ...ockerfile.manylinux2014_training_cuda11_6} | 4 +- .../requirements.txt | 6 - .../requirements.txt | 6 +- 6 files changed, 7 insertions(+), 222 deletions(-) delete mode 100644 tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda113.yml rename tools/ci_build/github/azure-pipelines/{orttraining-py-packaging-pipeline-cuda115.yml => orttraining-py-packaging-pipeline-cuda116.yml} (85%) delete mode 100644 tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_training_cuda11_3 rename tools/ci_build/github/linux/docker/{Dockerfile.manylinux2014_training_cuda11_5 => Dockerfile.manylinux2014_training_cuda11_6} (98%) delete mode 100644 tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch1.10.0_cu11.1/requirements.txt rename tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/{requirements_torch1.11.0_cu11.5 => requirements_torch1.11.0_cu11.6}/requirements.txt (55%) diff --git a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda113.yml b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda113.yml deleted file mode 100644 index 
421d0d377f..0000000000 --- a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda113.yml +++ /dev/null @@ -1,22 +0,0 @@ -trigger: none - -resources: - repositories: - - repository: manylinux - type: Github - endpoint: Microsoft - name: pypa/manylinux - ref: a8099af1b3e25f0489717ad9c4f9a2e25a8c5b36 - -stages: -- template: templates/py-packaging-training-cuda-stage.yml - parameters: - build_py_parameters: --enable_training --update --build - torch_version: '1.11.0' - opset_version: '15' - cuda_version: '11.3' - gcc_version: 10 - cmake_cuda_architectures: 37;50;52;60;61;70;75;80;86 - docker_file: Dockerfile.manylinux2014_training_cuda11_3 - agent_pool: Onnxruntime-Linux-GPU - upload_wheel: 'yes' diff --git a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda115.yml b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda116.yml similarity index 85% rename from tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda115.yml rename to tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda116.yml index 05ca414826..e6677a9567 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda115.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda116.yml @@ -14,9 +14,9 @@ stages: build_py_parameters: --enable_training --update --build torch_version: '1.11.0' opset_version: '15' - cuda_version: '11.5' + cuda_version: '11.6' gcc_version: 10 cmake_cuda_architectures: 37;50;52;60;61;70;75;80;86;87 - docker_file: Dockerfile.manylinux2014_training_cuda11_5 + docker_file: Dockerfile.manylinux2014_training_cuda11_6 agent_pool: Onnxruntime-Linux-GPU upload_wheel: 'yes' diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_training_cuda11_3 b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_training_cuda11_3 deleted file mode 100644 index e09d239f5b..0000000000 --- 
a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_training_cuda11_3 +++ /dev/null @@ -1,187 +0,0 @@ -ARG BASEIMAGE=nvcr.io/nvidia/cuda:11.3.1-cudnn8-devel-centos7 -ARG POLICY=manylinux2014 -ARG PLATFORM=x86_64 -ARG DEVTOOLSET_ROOTPATH= -ARG LD_LIBRARY_PATH_ARG= -ARG PREPEND_PATH= - -#We need both CUDA and manylinux. But the CUDA Toolkit End User License Agreement says NVIDIA CUDA Driver Libraries(libcuda.so, libnvidia-ptxjitcompiler.so) are only distributable in applications that meet this criteria: -#1. The application was developed starting from a NVIDIA CUDA container obtained from Docker Hub or the NVIDIA GPU Cloud, and -#2. The resulting application is packaged as a Docker container and distributed to users on Docker Hub or the NVIDIA GPU Cloud only. -#So we use CUDA as the base image then add manylinux on top of it. - -#Build manylinux2014 docker image begin -FROM $BASEIMAGE AS runtime_base -ARG POLICY -ARG PLATFORM -ARG DEVTOOLSET_ROOTPATH -ARG LD_LIBRARY_PATH_ARG -ARG PREPEND_PATH -LABEL maintainer="The ManyLinux project" - -ENV AUDITWHEEL_POLICY=${POLICY} AUDITWHEEL_ARCH=${PLATFORM} AUDITWHEEL_PLAT=${POLICY}_${PLATFORM} -ENV LC_ALL=en_US.UTF-8 LANG=en_US.UTF-8 LANGUAGE=en_US.UTF-8 -ENV DEVTOOLSET_ROOTPATH=${DEVTOOLSET_ROOTPATH} -ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH_ARG} -ENV PATH=${PREPEND_PATH}${PATH} -ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig - -# first copy the fixup mirrors script, keep the script around -COPY build_scripts/fixup-mirrors.sh /usr/local/sbin/fixup-mirrors - -# setup entrypoint, this will wrap commands with `linux32` with i686 images -COPY build_scripts/install-entrypoint.sh \ - build_scripts/update-system-packages.sh \ - build_scripts/build_utils.sh \ - /build_scripts/ - -RUN /build_scripts/install-entrypoint.sh && rm -rf /build_scripts -COPY manylinux-entrypoint /usr/local/bin/manylinux-entrypoint -ENTRYPOINT ["manylinux-entrypoint"] - -COPY build_scripts/install-runtime-packages.sh \ - 
build_scripts/update-system-packages.sh \ - build_scripts/build_utils.sh \ - /build_scripts/ -RUN manylinux-entrypoint /build_scripts/install-runtime-packages.sh && rm -rf /build_scripts/ - -COPY build_scripts/build_utils.sh /build_scripts/ - -COPY build_scripts/install-autoconf.sh /build_scripts/ -RUN export AUTOCONF_ROOT=autoconf-2.71 && \ - export AUTOCONF_HASH=431075ad0bf529ef13cb41e9042c542381103e80015686222b8a9d4abef42a1c && \ - export AUTOCONF_DOWNLOAD_URL=http://ftp.gnu.org/gnu/autoconf && \ - manylinux-entrypoint /build_scripts/install-autoconf.sh - -COPY build_scripts/install-automake.sh /build_scripts/ -RUN export AUTOMAKE_ROOT=automake-1.16.5 && \ - export AUTOMAKE_HASH=07bd24ad08a64bc17250ce09ec56e921d6343903943e99ccf63bbf0705e34605 && \ - export AUTOMAKE_DOWNLOAD_URL=http://ftp.gnu.org/gnu/automake && \ - manylinux-entrypoint /build_scripts/install-automake.sh - -COPY build_scripts/install-libtool.sh /build_scripts/ -RUN export LIBTOOL_ROOT=libtool-2.4.7 && \ - export LIBTOOL_HASH=04e96c2404ea70c590c546eba4202a4e12722c640016c12b9b2f1ce3d481e9a8 && \ - export LIBTOOL_DOWNLOAD_URL=http://ftp.gnu.org/gnu/libtool && \ - manylinux-entrypoint /build_scripts/install-libtool.sh - -COPY build_scripts/install-libxcrypt.sh /build_scripts/ -RUN export LIBXCRYPT_VERSION=4.4.28 && \ - export LIBXCRYPT_HASH=db7e37901969cb1d1e8020cb73a991ef81e48e31ea5b76a101862c806426b457 && \ - export LIBXCRYPT_DOWNLOAD_URL=https://github.com/besser82/libxcrypt/archive && \ - export PERL_ROOT=perl-5.34.0 && \ - export PERL_HASH=551efc818b968b05216024fb0b727ef2ad4c100f8cb6b43fab615fa78ae5be9a && \ - export PERL_DOWNLOAD_URL=https://www.cpan.org/src/5.0 && \ - manylinux-entrypoint /build_scripts/install-libxcrypt.sh - -FROM runtime_base AS build_base -COPY build_scripts/install-build-packages.sh /build_scripts/ -RUN manylinux-entrypoint /build_scripts/install-build-packages.sh - - -FROM build_base AS build_git -COPY build_scripts/build-git.sh /build_scripts/ -RUN export 
GIT_ROOT=git-2.36.2 && \ - export GIT_HASH=6dc2cdea5fb23d823ba4871cc23222c1db31dfbb6d6c6ff74c4128700df57c68 && \ - export GIT_DOWNLOAD_URL=https://www.kernel.org/pub/software/scm/git && \ - manylinux-entrypoint /build_scripts/build-git.sh - - -FROM build_base AS build_cpython -COPY build_scripts/build-sqlite3.sh /build_scripts/ -RUN export SQLITE_AUTOCONF_ROOT=sqlite-autoconf-3390200 && \ - export SQLITE_AUTOCONF_HASH=852be8a6183a17ba47cee0bbff7400b7aa5affd283bf3beefc34fcd088a239de && \ - export SQLITE_AUTOCONF_DOWNLOAD_URL=https://www.sqlite.org/2022 && \ - manylinux-entrypoint /build_scripts/build-sqlite3.sh - -COPY build_scripts/build-openssl.sh /build_scripts/ -RUN export OPENSSL_ROOT=openssl-1.1.1q && \ - export OPENSSL_HASH=d7939ce614029cdff0b6c20f0e2e5703158a489a72b2507b8bd51bf8c8fd10ca && \ - export OPENSSL_DOWNLOAD_URL=https://www.openssl.org/source && \ - manylinux-entrypoint /build_scripts/build-openssl.sh - -COPY build_scripts/build-cpython.sh /build_scripts/ - - -FROM build_cpython AS build_cpython37 -COPY build_scripts/cpython-pubkeys.txt /build_scripts/cpython-pubkeys.txt -RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.7.13 - - -FROM build_cpython AS build_cpython38 -COPY build_scripts/ambv-pubkey.txt /build_scripts/cpython-pubkeys.txt -RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.8.13 - - -FROM build_cpython AS build_cpython39 -COPY build_scripts/ambv-pubkey.txt /build_scripts/cpython-pubkeys.txt -RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.9.13 - - -FROM build_cpython AS build_cpython310 -COPY build_scripts/cpython-pubkey-310-311.txt /build_scripts/cpython-pubkeys.txt -RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.10.5 - -FROM build_cpython AS build_cpython311 -COPY build_scripts/cpython-pubkey-310-311.txt /build_scripts/cpython-pubkeys.txt -RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.11.0b5 - -FROM build_cpython AS all_python -COPY build_scripts/install-pypy.sh \ - 
build_scripts/pypy.sha256 \ - build_scripts/finalize-python.sh \ - /build_scripts/ -RUN manylinux-entrypoint /build_scripts/install-pypy.sh 3.7 7.3.9 -RUN manylinux-entrypoint /build_scripts/install-pypy.sh 3.8 7.3.9 -RUN manylinux-entrypoint /build_scripts/install-pypy.sh 3.9 7.3.9 -COPY --from=build_cpython37 /opt/_internal /opt/_internal/ -COPY --from=build_cpython38 /opt/_internal /opt/_internal/ -COPY --from=build_cpython39 /opt/_internal /opt/_internal/ -COPY --from=build_cpython310 /opt/_internal /opt/_internal/ -COPY --from=build_cpython311 /opt/_internal /opt/_internal/ -RUN manylinux-entrypoint /build_scripts/finalize-python.sh - - -FROM runtime_base -COPY --from=build_git /manylinux-rootfs / -COPY --from=build_cpython /manylinux-rootfs / -COPY --from=all_python /opt/_internal /opt/_internal/ -COPY build_scripts/finalize.sh \ - build_scripts/update-system-packages.sh \ - build_scripts/python-tag-abi-tag.py \ - build_scripts/requirements3.7.txt \ - build_scripts/requirements3.8.txt \ - build_scripts/requirements3.9.txt \ - build_scripts/requirements3.10.txt \ - build_scripts/requirements3.11.txt \ - build_scripts/requirements-base-tools.txt \ - /build_scripts/ -COPY build_scripts/requirements-tools/* /build_scripts/requirements-tools/ -RUN manylinux-entrypoint /build_scripts/finalize.sh && rm -rf /build_scripts - -ENV SSL_CERT_FILE=/opt/_internal/certs.pem - -CMD ["/bin/bash"] - -#Build manylinux2014 docker image end -ARG PYTHON_VERSION=3.9 -ARG TORCH_VERSION=1.11.0 -ARG OPSET_VERSION=15 -ARG INSTALL_DEPS_EXTRA_ARGS - -#Add our own dependencies -ADD scripts /tmp/scripts -RUN cd /tmp/scripts && \ - /tmp/scripts/manylinux/install_centos.sh && \ - /tmp/scripts/install_os_deps.sh -d gpu $INSTALL_DEPS_EXTRA_ARGS && \ - /tmp/scripts/install_ninja.sh && \ - /tmp/scripts/install_python_deps.sh -d gpu -v 11.3 -p $PYTHON_VERSION -h $TORCH_VERSION $INSTALL_DEPS_EXTRA_ARGS && \ - rm -rf /tmp/scripts - -ARG BUILD_UID=1001 -ARG BUILD_USER=onnxruntimedev -RUN adduser 
--uid $BUILD_UID $BUILD_USER -WORKDIR /home/$BUILD_USER -USER $BUILD_USER -ENV PATH /usr/local/gradle/bin:/usr/local/dotnet:$PATH -ENV ORTMODULE_ONNX_OPSET_VERSION=$OPSET_VERSION diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_training_cuda11_5 b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_training_cuda11_6 similarity index 98% rename from tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_training_cuda11_5 rename to tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_training_cuda11_6 index d79a32fbe5..c453e42212 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_training_cuda11_5 +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_training_cuda11_6 @@ -1,4 +1,4 @@ -ARG BASEIMAGE=nvidia/cuda:11.5.0-cudnn8-devel-centos7 +ARG BASEIMAGE=nvidia/cuda:11.6.2-cudnn8-devel-centos7 ARG POLICY=manylinux2014 ARG PLATFORM=x86_64 ARG DEVTOOLSET_ROOTPATH= @@ -175,7 +175,7 @@ RUN cd /tmp/scripts && \ /tmp/scripts/manylinux/install_centos.sh && \ /tmp/scripts/install_os_deps.sh -d gpu $INSTALL_DEPS_EXTRA_ARGS && \ /tmp/scripts/install_ninja.sh && \ - /tmp/scripts/install_python_deps.sh -d gpu -v 11.5 -p $PYTHON_VERSION -h $TORCH_VERSION $INSTALL_DEPS_EXTRA_ARGS && \ + /tmp/scripts/install_python_deps.sh -d gpu -v 11.6 -p $PYTHON_VERSION -h $TORCH_VERSION $INSTALL_DEPS_EXTRA_ARGS && \ rm -rf /tmp/scripts ARG BUILD_UID=1001 diff --git a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch1.10.0_cu11.1/requirements.txt b/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch1.10.0_cu11.1/requirements.txt deleted file mode 100644 index 7a51d5b270..0000000000 --- a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch1.10.0_cu11.1/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ ---pre --f https://download.pytorch.org/whl/cu111/torch_stable.html -torch==1.10.0+cu111 -torchvision==0.11.0+cu111 
-torchtext==0.11.0 -setuptools>=41.4.0 diff --git a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch1.11.0_cu11.5/requirements.txt b/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch1.11.0_cu11.6/requirements.txt similarity index 55% rename from tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch1.11.0_cu11.5/requirements.txt rename to tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch1.11.0_cu11.6/requirements.txt index aad92732cd..8d087ed76b 100644 --- a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch1.11.0_cu11.5/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch1.11.0_cu11.6/requirements.txt @@ -1,6 +1,6 @@ --pre -f https://download.pytorch.org/whl/torch_stable.html -torch==1.11.0+cu115 -torchvision==0.12.0+cu115 -torchtext==0.12.0 +torch==1.12.1+cu116 +torchvision==0.13.1+cu116 +torchtext==0.13.1 setuptools>=41.4.0