diff --git a/dockerfiles/Dockerfile.nuphar b/dockerfiles/Dockerfile.nuphar index a42dd9c928..2677f08267 100644 --- a/dockerfiles/Dockerfile.nuphar +++ b/dockerfiles/Dockerfile.nuphar @@ -17,7 +17,8 @@ RUN apt-get update && \ ENV PATH="/opt/cmake/bin:${PATH}" RUN git clone --single-branch --branch ${ONNXRUNTIME_SERVER_BRANCH} --recursive ${ONNXRUNTIME_REPO} onnxruntime RUN /onnxruntime/tools/ci_build/github/linux/docker/scripts/install_ubuntu.sh -p ${PYTHON_VERSION} && \ - /onnxruntime/tools/ci_build/github/linux/docker/scripts/install_deps.sh -p ${PYTHON_VERSION} + /onnxruntime/tools/ci_build/github/linux/docker/scripts/install_os_deps.sh && \ + /onnxruntime/tools/ci_build/github/linux/docker/scripts/install_python_deps.sh -p ${PYTHON_VERSION} WORKDIR / diff --git a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-distributed-test-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-distributed-test-ci-pipeline.yml index b4c063ef38..e042ea42ec 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-distributed-test-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-distributed-test-ci-pipeline.yml @@ -23,7 +23,8 @@ jobs: --build_wheel \ " \ -m \ - -u + -u \ + -e DisplayName: 'Build' - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdata-storage-key) -s "//orttrainingtestdata.file.core.windows.net/mnist" -d "/mnist" @@ -41,9 +42,7 @@ jobs: --volume $(Build.BinariesDirectory):/build \ --volume /mnist:/mnist \ onnxruntime_ortmodule_distributed_tests_image \ - /build/RelWithDebInfo/launch_test.py \ - --cmd_line_with_args "python orttraining_ortmodule_distributed_tests.py --mnist /mnist" \ - --cwd /build/RelWithDebInfo \ + bash -c "python3 -m pip install /build/RelWithDebInfo/dist/onnxruntime*.whl && rm -rf /build/RelWithDebInfo/onnxruntime/ && /build/RelWithDebInfo/launch_test.py 
--cmd_line_with_args 'python orttraining_ortmodule_distributed_tests.py --mnist /mnist' --cwd /build/RelWithDebInfo" \ displayName: 'Run orttraining_ortmodule_distributed_tests.py' condition: succeededOrFailed() timeoutInMinutes: 30 diff --git a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-test-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-test-ci-pipeline.yml index 95162e57cc..b4aa93b824 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-test-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-test-ci-pipeline.yml @@ -22,7 +22,8 @@ jobs: --update --build \ --build_wheel \ " \ - -u + -u \ + -e DisplayName: 'Build' - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdata-storage-key) -s "//orttrainingtestdata.file.core.windows.net/mnist" -d "/mnist" @@ -38,6 +39,8 @@ jobs: condition: succeededOrFailed() # Entry point for all ORTModule tests + # The onnxruntime folder is deleted in the build directory + # to enforce use of the onnxruntime wheel - script: | docker run \ --gpus all \ @@ -49,9 +52,7 @@ jobs: --volume /bert_data:/bert_data \ --volume /hf_models_cache:/hf_models_cache \ onnxruntime_ortmodule_tests_image \ - /build/RelWithDebInfo/launch_test.py \ - --cmd_line_with_args "python orttraining_ortmodule_tests.py --mnist /mnist --bert_data /bert_data/hf_data/glue_data/CoLA/original/raw --transformers_cache /hf_models_cache/huggingface/transformers" \ - --cwd /build/RelWithDebInfo \ + bash -c "python3 -m pip install /build/RelWithDebInfo/dist/onnxruntime*.whl && rm -rf /build/RelWithDebInfo/onnxruntime/ && /build/RelWithDebInfo/launch_test.py --cmd_line_with_args 'python orttraining_ortmodule_tests.py --mnist /mnist --bert_data /bert_data/hf_data/glue_data/CoLA/original/raw --transformers_cache /hf_models_cache/huggingface/transformers' --cwd 
/build/RelWithDebInfo" \ displayName: 'Run orttraining_ortmodule_tests.py' condition: succeededOrFailed() timeoutInMinutes: 60 diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_rocm b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_rocm index 889b5268fe..eb5cd9b3ce 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_rocm +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_rocm @@ -25,7 +25,8 @@ ARG INSTALL_DEPS_EXTRA_ARGS ADD scripts /tmp/scripts RUN cd /tmp/scripts && \ /tmp/scripts/install_centos.sh && \ - /tmp/scripts/install_deps.sh -d gpu -p $PYTHON_VERSION $INSTALL_DEPS_EXTRA_ARGS && \ + /tmp/scripts/install_os_deps.sh -d gpu $INSTALL_DEPS_EXTRA_ARGS && \ + /tmp/scripts/install_python_deps.sh -d gpu -p $PYTHON_VERSION $INSTALL_DEPS_EXTRA_ARGS && \ rm -rf /tmp/scripts ARG BUILD_UID=1001 diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_training_cuda10_2 b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_training_cuda10_2 index 35348c1db0..ed853d7ffb 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_training_cuda10_2 +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_training_cuda10_2 @@ -32,7 +32,8 @@ ARG INSTALL_DEPS_EXTRA_ARGS ADD scripts /tmp/scripts RUN cd /tmp/scripts && \ /tmp/scripts/install_centos.sh && \ - /tmp/scripts/install_deps.sh -d gpu -v 10.2 -p $PYTHON_VERSION $INSTALL_DEPS_EXTRA_ARGS && \ + /tmp/scripts/install_os_deps.sh -d gpu $INSTALL_DEPS_EXTRA_ARGS && \ + /tmp/scripts/install_python_deps.sh -d gpu -v 10.2 -p $PYTHON_VERSION $INSTALL_DEPS_EXTRA_ARGS && \ rm -rf /tmp/scripts ARG BUILD_UID=1001 diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_training_cuda11_1 b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_training_cuda11_1 index b9c1547164..60e02b0e59 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_training_cuda11_1 +++ 
b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_training_cuda11_1 @@ -32,7 +32,8 @@ ARG INSTALL_DEPS_EXTRA_ARGS ADD scripts /tmp/scripts RUN cd /tmp/scripts && \ /tmp/scripts/install_centos.sh && \ - /tmp/scripts/install_deps.sh -d gpu -v 11.1 -p $PYTHON_VERSION $INSTALL_DEPS_EXTRA_ARGS && \ + /tmp/scripts/install_os_deps.sh -d gpu $INSTALL_DEPS_EXTRA_ARGS && \ + /tmp/scripts/install_python_deps.sh -d gpu -v 11.1 -p $PYTHON_VERSION $INSTALL_DEPS_EXTRA_ARGS && \ rm -rf /tmp/scripts ARG BUILD_UID=1001 diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu index 5f8cff18e3..11672000a8 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu @@ -4,7 +4,7 @@ FROM ubuntu:${OS_VERSION} ARG PYTHON_VERSION=3.6 ADD scripts /tmp/scripts -RUN /tmp/scripts/install_ubuntu.sh -p $PYTHON_VERSION && /tmp/scripts/install_deps.sh -p $PYTHON_VERSION && rm -rf /tmp/scripts +RUN /tmp/scripts/install_ubuntu.sh -p $PYTHON_VERSION && /tmp/scripts/install_os_deps.sh && /tmp/scripts/install_python_deps.sh -p $PYTHON_VERSION && rm -rf /tmp/scripts WORKDIR /root diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_for_arm b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_for_arm index b1ffd33a54..ad973a263d 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_for_arm +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_for_arm @@ -4,7 +4,8 @@ FROM ubuntu:${OS_VERSION} ARG PYTHON_VERSION=3.5 ADD scripts /tmp/scripts RUN /tmp/scripts/install_ubuntu.sh -p $PYTHON_VERSION -d EdgeDevice && \ - /tmp/scripts/install_deps.sh -p $PYTHON_VERSION -d EdgeDevice && \ + /tmp/scripts/install_os_deps.sh -d EdgeDevice && \ + /tmp/scripts/install_python_deps.sh -p $PYTHON_VERSION -d EdgeDevice && \ /tmp/scripts/install_protobuf.sh ARG TOOL_CHAIN="fsl-imx-xwayland-glibc-x86_64-fsl-image-qt5-aarch64-toolchain-4.19-warrior.sh" diff 
--git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_gpu b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_gpu index 94c2e9f099..8f748bba31 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_gpu +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_gpu @@ -5,7 +5,8 @@ ARG INSTALL_DEPS_EXTRA_ARGS ADD scripts /tmp/scripts RUN /tmp/scripts/install_ubuntu.sh -p $PYTHON_VERSION && \ - /tmp/scripts/install_deps.sh -p $PYTHON_VERSION -d gpu $INSTALL_DEPS_EXTRA_ARGS && \ + /tmp/scripts/install_os_deps.sh -d gpu $INSTALL_DEPS_EXTRA_ARGS && \ + /tmp/scripts/install_python_deps.sh -p $PYTHON_VERSION -d gpu $INSTALL_DEPS_EXTRA_ARGS && \ rm -rf /tmp/scripts WORKDIR /root diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_gpu_training b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_gpu_training index 4a19397e2b..5948a473b4 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_gpu_training +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_gpu_training @@ -2,11 +2,16 @@ FROM nvidia/cuda:11.1.1-cudnn8-devel-ubuntu18.04 ARG PYTHON_VERSION=3.6 ARG INSTALL_DEPS_EXTRA_ARGS +ARG USE_CONDA=false ADD scripts /tmp/scripts RUN /tmp/scripts/install_ubuntu.sh -p $PYTHON_VERSION && \ - /tmp/scripts/install_deps.sh -p $PYTHON_VERSION -d gpu $INSTALL_DEPS_EXTRA_ARGS && \ - rm -rf /tmp/scripts + /tmp/scripts/install_os_deps.sh -d gpu $INSTALL_DEPS_EXTRA_ARGS + +# If USE_CONDA is false, use root to install python dependencies. 
+RUN if [ "$USE_CONDA" = false ] ; \ + then /tmp/scripts/install_python_deps.sh -p $PYTHON_VERSION -d gpu $INSTALL_DEPS_EXTRA_ARGS ; \ + fi WORKDIR /root @@ -26,3 +31,28 @@ ARG BUILD_UID=1000 RUN adduser --gecos 'onnxruntime Build User' --disabled-password $BUILD_USER --uid $BUILD_UID WORKDIR /home/$BUILD_USER USER $BUILD_USER + +ARG MINICONDA_PREFIX=/home/$BUILD_USER/miniconda3 +RUN if [ "$USE_CONDA" = true ] ; \ + then MINICONDA=miniconda.sh && \ + wget --no-verbose https://repo.anaconda.com/miniconda/Miniconda3-py37_4.9.2-Linux-x86_64.sh -O $MINICONDA && \ + chmod a+x $MINICONDA && \ + ./$MINICONDA -b -p $MINICONDA_PREFIX && \ + rm ./$MINICONDA && \ + $MINICONDA_PREFIX/bin/conda clean --yes --all && \ + $MINICONDA_PREFIX/bin/conda install -y python=$PYTHON_VERSION ; \ + fi + +ENV PATH /home/$BUILD_USER/miniconda3/bin:$PATH + +# If USE_CONDA is true, use onnxruntimedev user to install python dependencies +RUN if [ "$USE_CONDA" = true ] ; \ + then /tmp/scripts/install_python_deps.sh -p $PYTHON_VERSION -d gpu $INSTALL_DEPS_EXTRA_ARGS -c ; \ + fi + +WORKDIR /root +USER root +RUN rm -rf /tmp/scripts + +WORKDIR /home/$BUILD_USER +USER $BUILD_USER diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino index ccf6b148c2..4c43cd6cb7 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino @@ -6,7 +6,8 @@ ARG OPENVINO_VERSION=2021.3 ADD scripts /tmp/scripts RUN /tmp/scripts/install_ubuntu.sh -p $PYTHON_VERSION -d EdgeDevice && \ - /tmp/scripts/install_deps.sh -p $PYTHON_VERSION -d EdgeDevice + /tmp/scripts/install_os_deps.sh -d EdgeDevice && \ + /tmp/scripts/install_python_deps.sh -p $PYTHON_VERSION -d EdgeDevice RUN apt update && apt install -y libnuma1 ocl-icd-libopencl1 && \ rm -rf /var/lib/apt/lists/* /tmp/scripts diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt 
b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt index 077e228a06..60eb641ea2 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt @@ -8,7 +8,7 @@ ARG PYTHON_VERSION=3.8 ARG DEBIAN_FRONTEND=noninteractive ADD scripts /tmp/scripts -RUN /tmp/scripts/install_ubuntu.sh -p $PYTHON_VERSION && /tmp/scripts/install_deps.sh -p $PYTHON_VERSION && rm -rf /tmp/scripts \ +RUN /tmp/scripts/install_ubuntu.sh -p $PYTHON_VERSION && /tmp/scripts/install_os_deps.sh && /tmp/scripts/install_python_deps.sh -p $PYTHON_VERSION && rm -rf /tmp/scripts \ && rm /usr/local/bin/cmake && rm /usr/local/bin/ctest && rm -r /usr/local/share/cmake-3.14 WORKDIR /root diff --git a/tools/ci_build/github/linux/docker/scripts/install_deps.sh b/tools/ci_build/github/linux/docker/scripts/install_os_deps.sh similarity index 51% rename from tools/ci_build/github/linux/docker/scripts/install_deps.sh rename to tools/ci_build/github/linux/docker/scripts/install_os_deps.sh index 1fa12ab794..2754353899 100755 --- a/tools/ci_build/github/linux/docker/scripts/install_deps.sh +++ b/tools/ci_build/github/linux/docker/scripts/install_os_deps.sh @@ -2,27 +2,16 @@ set -e -x SCRIPT_DIR="$( dirname "${BASH_SOURCE[0]}" )" -INSTALL_DEPS_TRAINING=false INSTALL_DEPS_DISTRIBUTED_SETUP=false -ORTMODULE_BUILD=false -TARGET_ROCM=false -CU_VER="11.1" -while getopts p:d:v:tmur parameter_Option +while getopts d:m parameter_Option do case "${parameter_Option}" in -p) PYTHON_VER=${OPTARG};; d) DEVICE_TYPE=${OPTARG};; -v) CU_VER=${OPTARG};; -t) INSTALL_DEPS_TRAINING=true;; m) INSTALL_DEPS_DISTRIBUTED_SETUP=true;; -u) ORTMODULE_BUILD=true;; -r) TARGET_ROCM=true;; esac done -echo "Python version=$PYTHON_VER" - DEVICE_TYPE=${DEVICE_TYPE:=Normal} #Download a file from internet @@ -59,20 +48,6 @@ function GetFile { return $? 
} -if [[ "$PYTHON_VER" = "3.5" && -d "/opt/python/cp35-cp35m" ]]; then - PYTHON_EXE="/opt/python/cp35-cp35m/bin/python3.5" -elif [[ "$PYTHON_VER" = "3.6" && -d "/opt/python/cp36-cp36m" ]]; then - PYTHON_EXE="/opt/python/cp36-cp36m/bin/python3.6" -elif [[ "$PYTHON_VER" = "3.7" && -d "/opt/python/cp37-cp37m" ]]; then - PYTHON_EXE="/opt/python/cp37-cp37m/bin/python3.7" -elif [[ "$PYTHON_VER" = "3.8" && -d "/opt/python/cp38-cp38" ]]; then - PYTHON_EXE="/opt/python/cp38-cp38/bin/python3.8" -elif [[ "$PYTHON_VER" = "3.9" && -d "/opt/python/cp39-cp39" ]]; then - PYTHON_EXE="/opt/python/cp39-cp39/bin/python3.9" -else - PYTHON_EXE="/usr/bin/python${PYTHON_VER}" -fi - SYS_LONG_BIT=$(getconf LONG_BIT) mkdir -p /tmp/src GLIBC_VERSION=$(getconf GNU_LIBC_VERSION | cut -f 2 -d \.) @@ -114,43 +89,14 @@ unzip gradle-6.3-bin.zip mv /tmp/src/gradle-6.3 /usr/local/gradle if ! [ -x "$(command -v protoc)" ]; then - source ${0/%install_deps\.sh/install_protobuf\.sh} + source ${0/%install_os_deps\.sh/install_protobuf\.sh} fi -export ONNX_ML=1 -export CMAKE_ARGS="-DONNX_GEN_PB_TYPE_STUBS=OFF -DONNX_WERROR=OFF" -${PYTHON_EXE} -m pip install -r ${0/%install_deps\.sh/requirements\.txt} if [ $DEVICE_TYPE = "gpu" ]; then - if [[ $INSTALL_DEPS_TRAINING = true ]]; then - if [[ $ORTMODULE_BUILD = false ]]; then - ${PYTHON_EXE} -m pip install -r ${0/%install_deps.sh/training\/requirements.txt} - else - if [[ $TARGET_ROCM = false ]]; then - ${PYTHON_EXE} -m pip install -r ${0/%install_deps.sh/training\/ortmodule\/stage1\/requirements_torch_cu${CU_VER}.txt} - # Due to a [bug on DeepSpeed](https://github.com/microsoft/DeepSpeed/issues/663), we install it separately through ortmodule/stage2/requirements.txt - ${PYTHON_EXE} -m pip install -r ${0/%install_deps.sh/training\/ortmodule\/stage2\/requirements.txt} - else - ${PYTHON_EXE} -m pip install \ - --pre -f https://download.pytorch.org/whl/nightly/rocm4.1/torch_nightly.html \ - torch torchvision torchtext - ${PYTHON_EXE} -m pip install -r 
${0/%install_deps.sh/training\/ortmodule\/stage1\/requirements-rocm.txt} - ${PYTHON_EXE} -m pip install fairscale - # remove triton requirement from getting triggered in requirements-sparse_attn.txt - git clone https://github.com/ROCmSoftwarePlatform/DeepSpeed - cd DeepSpeed &&\ - rm requirements/requirements-sparse_attn.txt &&\ - ${PYTHON_EXE} setup.py bdist_wheel &&\ - ${PYTHON_EXE} -m pip install dist/deepspeed*.whl &&\ - cd .. - fi - fi - fi if [[ $INSTALL_DEPS_DISTRIBUTED_SETUP = true ]]; then - source ${0/%install_deps.sh/install_openmpi.sh} + source ${0/%install_os_deps.sh/install_openmpi.sh} fi fi cd / rm -rf /tmp/src -rm -rf /usr/include/google -rm -rf /usr/$LIBDIR/libproto* diff --git a/tools/ci_build/github/linux/docker/scripts/install_python_deps.sh b/tools/ci_build/github/linux/docker/scripts/install_python_deps.sh new file mode 100755 index 0000000000..9c6bc7188d --- /dev/null +++ b/tools/ci_build/github/linux/docker/scripts/install_python_deps.sh @@ -0,0 +1,75 @@ +#!/bin/bash +set -e -x + +INSTALL_DEPS_TRAINING=false +INSTALL_DEPS_DISTRIBUTED_SETUP=false +ORTMODULE_BUILD=false +TARGET_ROCM=false +CU_VER="11.1" +USE_CONDA=false + +while getopts p:d:v:tmurc parameter_Option +do case "${parameter_Option}" +in +p) PYTHON_VER=${OPTARG};; +d) DEVICE_TYPE=${OPTARG};; +v) CU_VER=${OPTARG};; +t) INSTALL_DEPS_TRAINING=true;; +m) INSTALL_DEPS_DISTRIBUTED_SETUP=true;; +u) ORTMODULE_BUILD=true;; +r) TARGET_ROCM=true;; +c) USE_CONDA=true;; +esac +done + +echo "Python version=$PYTHON_VER" + +DEVICE_TYPE=${DEVICE_TYPE:=Normal} + +if [[ $USE_CONDA = true ]]; then + # conda python version has already been installed by + # tools/ci_build/github/linux/docker/Dockerfile.ubuntu_gpu_training. 
+ # so, /home/onnxruntimedev/miniconda3/bin/python should point + # to the correct version of the python version + PYTHON_EXE="/home/onnxruntimedev/miniconda3/bin/python" +elif [[ "$PYTHON_VER" = "3.6" && -d "/opt/python/cp36-cp36m" ]]; then + PYTHON_EXE="/opt/python/cp36-cp36m/bin/python3.6" +elif [[ "$PYTHON_VER" = "3.7" && -d "/opt/python/cp37-cp37m" ]]; then + PYTHON_EXE="/opt/python/cp37-cp37m/bin/python3.7" +elif [[ "$PYTHON_VER" = "3.8" && -d "/opt/python/cp38-cp38" ]]; then + PYTHON_EXE="/opt/python/cp38-cp38/bin/python3.8" +elif [[ "$PYTHON_VER" = "3.9" && -d "/opt/python/cp39-cp39" ]]; then + PYTHON_EXE="/opt/python/cp39-cp39/bin/python3.9" +else + PYTHON_EXE="/usr/bin/python${PYTHON_VER}" +fi + +export ONNX_ML=1 +export CMAKE_ARGS="-DONNX_GEN_PB_TYPE_STUBS=OFF -DONNX_WERROR=OFF" +${PYTHON_EXE} -m pip install -r ${0/%install_python_deps\.sh/requirements\.txt} +if [ $DEVICE_TYPE = "gpu" ]; then + if [[ $INSTALL_DEPS_TRAINING = true ]]; then + if [[ $ORTMODULE_BUILD = false ]]; then + ${PYTHON_EXE} -m pip install -r ${0/%install_python_deps.sh/training\/requirements.txt} + else + if [[ $TARGET_ROCM = false ]]; then + ${PYTHON_EXE} -m pip install -r ${0/%install_python_deps.sh/training\/ortmodule\/stage1\/requirements_torch_cu${CU_VER}.txt} + # Due to a [bug on DeepSpeed](https://github.com/microsoft/DeepSpeed/issues/663), we install it separately through ortmodule/stage2/requirements.txt + ${PYTHON_EXE} -m pip install -r ${0/%install_python_deps.sh/training\/ortmodule\/stage2\/requirements.txt} + else + ${PYTHON_EXE} -m pip install \ + --pre -f https://download.pytorch.org/whl/nightly/rocm4.1/torch_nightly.html \ + torch torchvision torchtext + ${PYTHON_EXE} -m pip install -r ${0/%install_python_deps.sh/training\/ortmodule\/stage1\/requirements-rocm.txt} + ${PYTHON_EXE} -m pip install fairscale + # remove triton requirement from getting triggered in requirements-sparse_attn.txt + git clone https://github.com/ROCmSoftwarePlatform/DeepSpeed + cd DeepSpeed &&\ 
+ rm requirements/requirements-sparse_attn.txt &&\ + ${PYTHON_EXE} setup.py bdist_wheel &&\ + ${PYTHON_EXE} -m pip install dist/deepspeed*.whl &&\ + cd .. + fi + fi + fi +fi diff --git a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage2/requirements.txt b/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage2/requirements.txt index 92d60d2f4c..17d0b64389 100644 --- a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage2/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage2/requirements.txt @@ -6,5 +6,5 @@ tensorboard h5py wget pytorch-lightning==1.2.5 -deepspeed +deepspeed==0.3.15 fairscale diff --git a/tools/ci_build/github/linux/run_dockerbuild.sh b/tools/ci_build/github/linux/run_dockerbuild.sh index d202a60d44..012d8a945c 100755 --- a/tools/ci_build/github/linux/run_dockerbuild.sh +++ b/tools/ci_build/github/linux/run_dockerbuild.sh @@ -7,10 +7,11 @@ CUDA_VER=cuda10.1-cudnn7.6 YOCTO_VERSION="4.19" INSTALL_DEPS_DISTRIBUTED_SETUP=false ORTMODULE_BUILD=false +USE_CONDA=false ALLOW_RELEASED_ONNX_OPSET_ONLY_ENV="ALLOW_RELEASED_ONNX_OPSET_ONLY="$ALLOW_RELEASED_ONNX_OPSET_ONLY echo "ALLOW_RELEASED_ONNX_OPSET_ONLY environment variable is set as "$ALLOW_RELEASED_ONNX_OPSET_ONLY_ENV -while getopts c:o:d:r:p:x:a:v:y:t:i:mu parameter_Option +while getopts c:o:d:r:p:x:a:v:y:t:i:mue parameter_Option do case "${parameter_Option}" in #android, ubuntu16.04, ubuntu18.04, CentOS7 @@ -39,6 +40,8 @@ i) IMAGE_CACHE_CONTAINER_REGISTRY_NAME=${OPTARG};; m) INSTALL_DEPS_DISTRIBUTED_SETUP=true;; # install ortmodule specific dependencies u) ORTMODULE_BUILD=true;; +# install and use conda +e) USE_CONDA=true;; esac done @@ -91,7 +94,7 @@ else INSTALL_DEPS_EXTRA_ARGS="${INSTALL_DEPS_EXTRA_ARGS} -u" fi $GET_DOCKER_IMAGE_CMD --repository "onnxruntime-$IMAGE" \ - --docker-build-args="--build-arg BUILD_USER=onnxruntimedev --build-arg BUILD_UID=$(id -u) --build-arg PYTHON_VERSION=${PYTHON_VER} --build-arg 
INSTALL_DEPS_EXTRA_ARGS=\"${INSTALL_DEPS_EXTRA_ARGS}\"" \ + --docker-build-args="--build-arg BUILD_USER=onnxruntimedev --build-arg BUILD_UID=$(id -u) --build-arg PYTHON_VERSION=${PYTHON_VER} --build-arg INSTALL_DEPS_EXTRA_ARGS=\"${INSTALL_DEPS_EXTRA_ARGS}\" --build-arg USE_CONDA=${USE_CONDA}" \ --dockerfile $DOCKER_FILE --context . elif [ $BUILD_DEVICE = "tensorrt" ]; then # TensorRT container release 20.12