Install and use conda on ortmodule CI pipelines (#7530)

* Install and use conda on ortmodule CI pipelines

* Update build script to install onnxruntime wheel before running unit tests

* Remove python 3.5 from install_python_deps

* Pin deepspeed version to 0.3.15
This commit is contained in:
baijumeswani 2021-05-03 15:52:22 -07:00 committed by GitHub
parent ad15811ade
commit cab84d902e
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
16 changed files with 140 additions and 79 deletions

View file

@ -17,7 +17,8 @@ RUN apt-get update && \
ENV PATH="/opt/cmake/bin:${PATH}"
RUN git clone --single-branch --branch ${ONNXRUNTIME_SERVER_BRANCH} --recursive ${ONNXRUNTIME_REPO} onnxruntime
RUN /onnxruntime/tools/ci_build/github/linux/docker/scripts/install_ubuntu.sh -p ${PYTHON_VERSION} && \
/onnxruntime/tools/ci_build/github/linux/docker/scripts/install_deps.sh -p ${PYTHON_VERSION}
/onnxruntime/tools/ci_build/github/linux/docker/scripts/install_os_deps.sh && \
/onnxruntime/tools/ci_build/github/linux/docker/scripts/install_python_deps.sh -p ${PYTHON_VERSION}
WORKDIR /

View file

@ -23,7 +23,8 @@ jobs:
--build_wheel \
" \
-m \
-u
-u \
-e
DisplayName: 'Build'
- bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdata-storage-key) -s "//orttrainingtestdata.file.core.windows.net/mnist" -d "/mnist"
@ -41,9 +42,7 @@ jobs:
--volume $(Build.BinariesDirectory):/build \
--volume /mnist:/mnist \
onnxruntime_ortmodule_distributed_tests_image \
/build/RelWithDebInfo/launch_test.py \
--cmd_line_with_args "python orttraining_ortmodule_distributed_tests.py --mnist /mnist" \
--cwd /build/RelWithDebInfo \
bash -c "python3 -m pip install /build/RelWithDebInfo/dist/onnxruntime*.whl ; rm -rf /build/RelWithDebInfo/onnxruntime/ ; /build/RelWithDebInfo/launch_test.py --cmd_line_with_args 'python orttraining_ortmodule_distributed_tests.py --mnist /mnist' --cwd /build/RelWithDebInfo" \
displayName: 'Run orttraining_ortmodule_distributed_tests.py'
condition: succeededOrFailed()
timeoutInMinutes: 30

View file

@ -22,7 +22,8 @@ jobs:
--update --build \
--build_wheel \
" \
-u
-u \
-e
DisplayName: 'Build'
- bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdata-storage-key) -s "//orttrainingtestdata.file.core.windows.net/mnist" -d "/mnist"
@ -38,6 +39,8 @@ jobs:
condition: succeededOrFailed()
# Entry point for all ORTModule tests
# The onnxruntime folder is deleted in the build directory
# to enforce use of the onnxruntime wheel
- script: |
docker run \
--gpus all \
@ -49,9 +52,7 @@ jobs:
--volume /bert_data:/bert_data \
--volume /hf_models_cache:/hf_models_cache \
onnxruntime_ortmodule_tests_image \
/build/RelWithDebInfo/launch_test.py \
--cmd_line_with_args "python orttraining_ortmodule_tests.py --mnist /mnist --bert_data /bert_data/hf_data/glue_data/CoLA/original/raw --transformers_cache /hf_models_cache/huggingface/transformers" \
--cwd /build/RelWithDebInfo \
bash -c "python3 -m pip install /build/RelWithDebInfo/dist/onnxruntime*.whl ; rm -rf /build/RelWithDebInfo/onnxruntime/ ; /build/RelWithDebInfo/launch_test.py --cmd_line_with_args 'python orttraining_ortmodule_tests.py --mnist /mnist --bert_data /bert_data/hf_data/glue_data/CoLA/original/raw --transformers_cache /hf_models_cache/huggingface/transformers' --cwd /build/RelWithDebInfo" \
displayName: 'Run orttraining_ortmodule_tests.py'
condition: succeededOrFailed()
timeoutInMinutes: 60

View file

@ -25,7 +25,8 @@ ARG INSTALL_DEPS_EXTRA_ARGS
ADD scripts /tmp/scripts
RUN cd /tmp/scripts && \
/tmp/scripts/install_centos.sh && \
/tmp/scripts/install_deps.sh -d gpu -p $PYTHON_VERSION $INSTALL_DEPS_EXTRA_ARGS && \
/tmp/scripts/install_os_deps.sh -d gpu $INSTALL_DEPS_EXTRA_ARGS && \
/tmp/scripts/install_python_deps.sh -d gpu -p $PYTHON_VERSION $INSTALL_DEPS_EXTRA_ARGS && \
rm -rf /tmp/scripts
ARG BUILD_UID=1001

View file

@ -32,7 +32,8 @@ ARG INSTALL_DEPS_EXTRA_ARGS
ADD scripts /tmp/scripts
RUN cd /tmp/scripts && \
/tmp/scripts/install_centos.sh && \
/tmp/scripts/install_deps.sh -d gpu -v 10.2 -p $PYTHON_VERSION $INSTALL_DEPS_EXTRA_ARGS && \
/tmp/scripts/install_os_deps.sh -d gpu $INSTALL_DEPS_EXTRA_ARGS && \
/tmp/scripts/install_python_deps.sh -d gpu -v 10.2 -p $PYTHON_VERSION $INSTALL_DEPS_EXTRA_ARGS && \
rm -rf /tmp/scripts
ARG BUILD_UID=1001

View file

@ -32,7 +32,8 @@ ARG INSTALL_DEPS_EXTRA_ARGS
ADD scripts /tmp/scripts
RUN cd /tmp/scripts && \
/tmp/scripts/install_centos.sh && \
/tmp/scripts/install_deps.sh -d gpu -v 11.1 -p $PYTHON_VERSION $INSTALL_DEPS_EXTRA_ARGS && \
/tmp/scripts/install_os_deps.sh -d gpu $INSTALL_DEPS_EXTRA_ARGS && \
/tmp/scripts/install_python_deps.sh -d gpu -v 11.1 -p $PYTHON_VERSION $INSTALL_DEPS_EXTRA_ARGS && \
rm -rf /tmp/scripts
ARG BUILD_UID=1001

View file

@ -4,7 +4,7 @@ FROM ubuntu:${OS_VERSION}
ARG PYTHON_VERSION=3.6
ADD scripts /tmp/scripts
RUN /tmp/scripts/install_ubuntu.sh -p $PYTHON_VERSION && /tmp/scripts/install_deps.sh -p $PYTHON_VERSION && rm -rf /tmp/scripts
RUN /tmp/scripts/install_ubuntu.sh -p $PYTHON_VERSION && /tmp/scripts/install_os_deps.sh && /tmp/scripts/install_python_deps.sh -p $PYTHON_VERSION && rm -rf /tmp/scripts
WORKDIR /root

View file

@ -4,7 +4,8 @@ FROM ubuntu:${OS_VERSION}
ARG PYTHON_VERSION=3.5
ADD scripts /tmp/scripts
RUN /tmp/scripts/install_ubuntu.sh -p $PYTHON_VERSION -d EdgeDevice && \
/tmp/scripts/install_deps.sh -p $PYTHON_VERSION -d EdgeDevice && \
/tmp/scripts/install_os_deps.sh -d EdgeDevice && \
/tmp/scripts/install_python_deps.sh -p $PYTHON_VERSION -d EdgeDevice && \
/tmp/scripts/install_protobuf.sh
ARG TOOL_CHAIN="fsl-imx-xwayland-glibc-x86_64-fsl-image-qt5-aarch64-toolchain-4.19-warrior.sh"

View file

@ -5,7 +5,8 @@ ARG INSTALL_DEPS_EXTRA_ARGS
ADD scripts /tmp/scripts
RUN /tmp/scripts/install_ubuntu.sh -p $PYTHON_VERSION && \
/tmp/scripts/install_deps.sh -p $PYTHON_VERSION -d gpu $INSTALL_DEPS_EXTRA_ARGS && \
/tmp/scripts/install_os_deps.sh -d gpu $INSTALL_DEPS_EXTRA_ARGS && \
/tmp/scripts/install_python_deps.sh -p $PYTHON_VERSION -d gpu $INSTALL_DEPS_EXTRA_ARGS && \
rm -rf /tmp/scripts
WORKDIR /root

View file

@ -2,11 +2,16 @@ FROM nvidia/cuda:11.1.1-cudnn8-devel-ubuntu18.04
ARG PYTHON_VERSION=3.6
ARG INSTALL_DEPS_EXTRA_ARGS
ARG USE_CONDA=false
ADD scripts /tmp/scripts
RUN /tmp/scripts/install_ubuntu.sh -p $PYTHON_VERSION && \
/tmp/scripts/install_deps.sh -p $PYTHON_VERSION -d gpu $INSTALL_DEPS_EXTRA_ARGS && \
rm -rf /tmp/scripts
/tmp/scripts/install_os_deps.sh -p $PYTHON_VERSION -d gpu $INSTALL_DEPS_EXTRA_ARGS
# If USE_CONDA is false, use root to install python dependencies.
RUN if [ "$USE_CONDA" = false ] ; \
then /tmp/scripts/install_python_deps.sh -p $PYTHON_VERSION -d gpu $INSTALL_DEPS_EXTRA_ARGS ; \
fi
WORKDIR /root
@ -26,3 +31,28 @@ ARG BUILD_UID=1000
RUN adduser --gecos 'onnxruntime Build User' --disabled-password $BUILD_USER --uid $BUILD_UID
WORKDIR /home/$BUILD_USER
USER $BUILD_USER
ARG MINICONDA_PREFIX=/home/$BUILD_USER/miniconda3
RUN if [ "$USE_CONDA" = true ] ; \
then MINICONDA=miniconda.sh && \
wget --no-verbose https://repo.anaconda.com/miniconda/Miniconda3-py37_4.9.2-Linux-x86_64.sh -O $MINICONDA && \
chmod a+x $MINICONDA && \
./$MINICONDA -b -p $MINICONDA_PREFIX && \
rm ./$MINICONDA && \
$MINICONDA_PREFIX/bin/conda clean --yes --all && \
$MINICONDA_PREFIX/bin/conda install -y python=$PYTHON_VERSION ; \
fi
ENV PATH /home/$BUILD_USER/miniconda3/bin:$PATH
# If USE_CONDA is true, use onnxruntimedev user to install python dependencies
RUN if [ "$USE_CONDA" = true ] ; \
then /tmp/scripts/install_python_deps.sh -p $PYTHON_VERSION -d gpu $INSTALL_DEPS_EXTRA_ARGS -c ; \
fi
WORKDIR /root
USER root
RUN rm -rf /tmp/scripts
WORKDIR /home/$BUILD_USER
USER $BUILD_USER

View file

@ -6,7 +6,8 @@ ARG OPENVINO_VERSION=2021.3
ADD scripts /tmp/scripts
RUN /tmp/scripts/install_ubuntu.sh -p $PYTHON_VERSION -d EdgeDevice && \
/tmp/scripts/install_deps.sh -p $PYTHON_VERSION -d EdgeDevice
/tmp/scripts/install_os_deps.sh -d EdgeDevice && \
/tmp/scripts/install_python_deps.sh -p $PYTHON_VERSION -d EdgeDevice
RUN apt update && apt install -y libnuma1 ocl-icd-libopencl1 && \
rm -rf /var/lib/apt/lists/* /tmp/scripts

View file

@ -8,7 +8,7 @@ ARG PYTHON_VERSION=3.8
ARG DEBIAN_FRONTEND=noninteractive
ADD scripts /tmp/scripts
RUN /tmp/scripts/install_ubuntu.sh -p $PYTHON_VERSION && /tmp/scripts/install_deps.sh -p $PYTHON_VERSION && rm -rf /tmp/scripts \
RUN /tmp/scripts/install_ubuntu.sh -p $PYTHON_VERSION && /tmp/scripts/install_os_deps.sh && /tmp/scripts/install_python_deps.sh -p $PYTHON_VERSION && rm -rf /tmp/scripts \
&& rm /usr/local/bin/cmake && rm /usr/local/bin/ctest && rm -r /usr/local/share/cmake-3.14
WORKDIR /root

View file

@ -2,27 +2,16 @@
set -e -x
SCRIPT_DIR="$( dirname "${BASH_SOURCE[0]}" )"
INSTALL_DEPS_TRAINING=false
INSTALL_DEPS_DISTRIBUTED_SETUP=false
ORTMODULE_BUILD=false
TARGET_ROCM=false
CU_VER="11.1"
while getopts p:d:v:tmur parameter_Option
while getopts d:m parameter_Option
do case "${parameter_Option}"
in
p) PYTHON_VER=${OPTARG};;
d) DEVICE_TYPE=${OPTARG};;
v) CU_VER=${OPTARG};;
t) INSTALL_DEPS_TRAINING=true;;
m) INSTALL_DEPS_DISTRIBUTED_SETUP=true;;
u) ORTMODULE_BUILD=true;;
r) TARGET_ROCM=true;;
esac
done
echo "Python version=$PYTHON_VER"
DEVICE_TYPE=${DEVICE_TYPE:=Normal}
#Download a file from internet
@ -59,20 +48,6 @@ function GetFile {
return $?
}
if [[ "$PYTHON_VER" = "3.5" && -d "/opt/python/cp35-cp35m" ]]; then
PYTHON_EXE="/opt/python/cp35-cp35m/bin/python3.5"
elif [[ "$PYTHON_VER" = "3.6" && -d "/opt/python/cp36-cp36m" ]]; then
PYTHON_EXE="/opt/python/cp36-cp36m/bin/python3.6"
elif [[ "$PYTHON_VER" = "3.7" && -d "/opt/python/cp37-cp37m" ]]; then
PYTHON_EXE="/opt/python/cp37-cp37m/bin/python3.7"
elif [[ "$PYTHON_VER" = "3.8" && -d "/opt/python/cp38-cp38" ]]; then
PYTHON_EXE="/opt/python/cp38-cp38/bin/python3.8"
elif [[ "$PYTHON_VER" = "3.9" && -d "/opt/python/cp39-cp39" ]]; then
PYTHON_EXE="/opt/python/cp39-cp39/bin/python3.9"
else
PYTHON_EXE="/usr/bin/python${PYTHON_VER}"
fi
SYS_LONG_BIT=$(getconf LONG_BIT)
mkdir -p /tmp/src
GLIBC_VERSION=$(getconf GNU_LIBC_VERSION | cut -f 2 -d \.)
@ -114,43 +89,14 @@ unzip gradle-6.3-bin.zip
mv /tmp/src/gradle-6.3 /usr/local/gradle
if ! [ -x "$(command -v protoc)" ]; then
source ${0/%install_deps\.sh/install_protobuf\.sh}
source ${0/%install_os_deps\.sh/install_protobuf\.sh}
fi
export ONNX_ML=1
export CMAKE_ARGS="-DONNX_GEN_PB_TYPE_STUBS=OFF -DONNX_WERROR=OFF"
${PYTHON_EXE} -m pip install -r ${0/%install_deps\.sh/requirements\.txt}
if [ $DEVICE_TYPE = "gpu" ]; then
if [[ $INSTALL_DEPS_TRAINING = true ]]; then
if [[ $ORTMODULE_BUILD = false ]]; then
${PYTHON_EXE} -m pip install -r ${0/%install_deps.sh/training\/requirements.txt}
else
if [[ $TARGET_ROCM = false ]]; then
${PYTHON_EXE} -m pip install -r ${0/%install_deps.sh/training\/ortmodule\/stage1\/requirements_torch_cu${CU_VER}.txt}
# Due to a [bug on DeepSpeed](https://github.com/microsoft/DeepSpeed/issues/663), we install it separately through ortmodule/stage2/requirements.txt
${PYTHON_EXE} -m pip install -r ${0/%install_deps.sh/training\/ortmodule\/stage2\/requirements.txt}
else
${PYTHON_EXE} -m pip install \
--pre -f https://download.pytorch.org/whl/nightly/rocm4.1/torch_nightly.html \
torch torchvision torchtext
${PYTHON_EXE} -m pip install -r ${0/%install_deps.sh/training\/ortmodule\/stage1\/requirements-rocm.txt}
${PYTHON_EXE} -m pip install fairscale
# remove triton requirement from getting triggered in requirements-sparse_attn.txt
git clone https://github.com/ROCmSoftwarePlatform/DeepSpeed
cd DeepSpeed &&\
rm requirements/requirements-sparse_attn.txt &&\
${PYTHON_EXE} setup.py bdist_wheel &&\
${PYTHON_EXE} -m pip install dist/deepspeed*.whl &&\
cd ..
fi
fi
fi
if [[ $INSTALL_DEPS_DISTRIBUTED_SETUP = true ]]; then
source ${0/%install_deps.sh/install_openmpi.sh}
source ${0/%install_os_deps.sh/install_openmpi.sh}
fi
fi
cd /
rm -rf /tmp/src
rm -rf /usr/include/google
rm -rf /usr/$LIBDIR/libproto*

View file

@ -0,0 +1,75 @@
#!/bin/bash
# Installs the Python packages listed in the requirements files that live
# next to this script, using the interpreter selected by the options below.
#
# Options:
#   -p <ver>   Python version (e.g. 3.8). Selects the interpreter unless -c is given.
#   -d <type>  Device type. Training packages are only installed for "gpu".
#              Defaults to "Normal".
#   -v <ver>   CUDA version used to pick the torch requirements file. Defaults to 11.1.
#   -t         Install the training dependencies (only honoured with -d gpu).
#   -m         Accepted so callers can share one argument list with the other
#              install scripts; the flag is recorded but not read by this script.
#   -u         ORTModule build: install the ortmodule stage1/stage2 requirements
#              instead of training/requirements.txt.
#   -r         Target ROCm: install nightly ROCm torch and build DeepSpeed from source.
#   -c         Use the conda interpreter instead of a system / manylinux one.
set -e -x

# Absolute directory of this script. Requirement files are resolved against it
# so the lookup does not depend on how the script was invoked or on the
# current working directory.
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

INSTALL_DEPS_TRAINING=false
INSTALL_DEPS_DISTRIBUTED_SETUP=false
ORTMODULE_BUILD=false
TARGET_ROCM=false
CU_VER="11.1"
USE_CONDA=false

while getopts p:d:v:tmurc parameter_Option
do case "${parameter_Option}"
in
p) PYTHON_VER=${OPTARG};;
d) DEVICE_TYPE=${OPTARG};;
v) CU_VER=${OPTARG};;
t) INSTALL_DEPS_TRAINING=true;;
m) INSTALL_DEPS_DISTRIBUTED_SETUP=true;;
u) ORTMODULE_BUILD=true;;
r) TARGET_ROCM=true;;
c) USE_CONDA=true;;
esac
done

echo "Python version=$PYTHON_VER"

DEVICE_TYPE=${DEVICE_TYPE:=Normal}

if [[ $USE_CONDA = true ]]; then
  # The conda python of the requested version is expected to have been
  # installed already (see Dockerfile.ubuntu_gpu_training), so the conda
  # "python" binary is used as-is and -p is not consulted here.
  # MINICONDA_PREFIX is honoured when the caller exports it; otherwise the
  # historical default location is used.
  # NOTE(review): the training Dockerfile declares MINICONDA_PREFIX as a build
  # ARG - confirm it is visible in the RUN step that invokes this script.
  PYTHON_EXE="${MINICONDA_PREFIX:-/home/onnxruntimedev/miniconda3}/bin/python"
elif [[ "$PYTHON_VER" = "3.6" && -d "/opt/python/cp36-cp36m" ]]; then
  PYTHON_EXE="/opt/python/cp36-cp36m/bin/python3.6"
elif [[ "$PYTHON_VER" = "3.7" && -d "/opt/python/cp37-cp37m" ]]; then
  PYTHON_EXE="/opt/python/cp37-cp37m/bin/python3.7"
elif [[ "$PYTHON_VER" = "3.8" && -d "/opt/python/cp38-cp38" ]]; then
  PYTHON_EXE="/opt/python/cp38-cp38/bin/python3.8"
elif [[ "$PYTHON_VER" = "3.9" && -d "/opt/python/cp39-cp39" ]]; then
  PYTHON_EXE="/opt/python/cp39-cp39/bin/python3.9"
else
  # No matching /opt/python/<tag> directory: fall back to /usr/bin.
  PYTHON_EXE="/usr/bin/python${PYTHON_VER}"
fi

export ONNX_ML=1
export CMAKE_ARGS="-DONNX_GEN_PB_TYPE_STUBS=OFF -DONNX_WERROR=OFF"

"${PYTHON_EXE}" -m pip install -r "${SCRIPT_DIR}/requirements.txt"

if [[ "$DEVICE_TYPE" = "gpu" ]]; then
  if [[ $INSTALL_DEPS_TRAINING = true ]]; then
    if [[ $ORTMODULE_BUILD = false ]]; then
      "${PYTHON_EXE}" -m pip install -r "${SCRIPT_DIR}/training/requirements.txt"
    else
      if [[ $TARGET_ROCM = false ]]; then
        "${PYTHON_EXE}" -m pip install -r "${SCRIPT_DIR}/training/ortmodule/stage1/requirements_torch_cu${CU_VER}.txt"
        # Due to a [bug on DeepSpeed](https://github.com/microsoft/DeepSpeed/issues/663), we install it separately through ortmodule/stage2/requirements.txt
        "${PYTHON_EXE}" -m pip install -r "${SCRIPT_DIR}/training/ortmodule/stage2/requirements.txt"
      else
        "${PYTHON_EXE}" -m pip install \
          --pre -f https://download.pytorch.org/whl/nightly/rocm4.1/torch_nightly.html \
          torch torchvision torchtext
        "${PYTHON_EXE}" -m pip install -r "${SCRIPT_DIR}/training/ortmodule/stage1/requirements-rocm.txt"
        "${PYTHON_EXE}" -m pip install fairscale

        # Build the ROCm fork of DeepSpeed in a scratch directory so the clone
        # does not end up in the caller's working directory.
        DEEPSPEED_SRC="$(mktemp -d)"
        git clone https://github.com/ROCmSoftwarePlatform/DeepSpeed "${DEEPSPEED_SRC}/DeepSpeed"
        # A subshell with one command per line is used instead of an "&&"
        # chain: "set -e" does not stop at a failing command inside such a
        # chain, and the subshell keeps the caller's working directory intact.
        (
          cd "${DEEPSPEED_SRC}/DeepSpeed"
          # remove triton requirement from getting triggered in requirements-sparse_attn.txt
          rm requirements/requirements-sparse_attn.txt
          "${PYTHON_EXE}" setup.py bdist_wheel
          "${PYTHON_EXE}" -m pip install dist/deepspeed*.whl
        )
        rm -rf "${DEEPSPEED_SRC}"
      fi
    fi
  fi
fi

View file

@ -6,5 +6,5 @@ tensorboard
h5py
wget
pytorch-lightning==1.2.5
deepspeed
deepspeed==0.3.15
fairscale

View file

@ -7,10 +7,11 @@ CUDA_VER=cuda10.1-cudnn7.6
YOCTO_VERSION="4.19"
INSTALL_DEPS_DISTRIBUTED_SETUP=false
ORTMODULE_BUILD=false
USE_CONDA=false
ALLOW_RELEASED_ONNX_OPSET_ONLY_ENV="ALLOW_RELEASED_ONNX_OPSET_ONLY="$ALLOW_RELEASED_ONNX_OPSET_ONLY
echo "ALLOW_RELEASED_ONNX_OPSET_ONLY environment variable is set as "$ALLOW_RELEASED_ONNX_OPSET_ONLY_ENV
while getopts c:o:d:r:p:x:a:v:y:t:i:mu parameter_Option
while getopts c:o:d:r:p:x:a:v:y:t:i:mue parameter_Option
do case "${parameter_Option}"
in
#android, ubuntu16.04, ubuntu18.04, CentOS7
@ -39,6 +40,8 @@ i) IMAGE_CACHE_CONTAINER_REGISTRY_NAME=${OPTARG};;
m) INSTALL_DEPS_DISTRIBUTED_SETUP=true;;
# install ortmodule specific dependencies
u) ORTMODULE_BUILD=true;;
# install and use conda
e) USE_CONDA=true;;
esac
done
@ -91,7 +94,7 @@ else
INSTALL_DEPS_EXTRA_ARGS="${INSTALL_DEPS_EXTRA_ARGS} -u"
fi
$GET_DOCKER_IMAGE_CMD --repository "onnxruntime-$IMAGE" \
--docker-build-args="--build-arg BUILD_USER=onnxruntimedev --build-arg BUILD_UID=$(id -u) --build-arg PYTHON_VERSION=${PYTHON_VER} --build-arg INSTALL_DEPS_EXTRA_ARGS=\"${INSTALL_DEPS_EXTRA_ARGS}\"" \
--docker-build-args="--build-arg BUILD_USER=onnxruntimedev --build-arg BUILD_UID=$(id -u) --build-arg PYTHON_VERSION=${PYTHON_VER} --build-arg INSTALL_DEPS_EXTRA_ARGS=\"${INSTALL_DEPS_EXTRA_ARGS}\" --build-arg USE_CONDA=${USE_CONDA}" \
--dockerfile $DOCKER_FILE --context .
elif [ $BUILD_DEVICE = "tensorrt" ]; then
# TensorRT container release 20.12