onnxruntime/tools/ci_build/github/linux/docker/scripts/install_deps.sh
Suffian Khan 7a3c1787af
Add CI pipeline to publish Python training package targeting Rocm (#7417)
* first attempt rocm training wheel

* modifications needed to python packaging pipeline for Rocm 4.1

* changges to not conflict with cuda

missed stage1 changes

remove package push

add option r to getopt

try again without python install

try again without python install

try again without python install

split pipelines and add back push to remote storage

try on cuda gpu pool

try again

try again

try running without az subscription set

try again on original pipeline

change pool

passing AMD Rocm whl on AMD-GPU pool

split rocm pipeline from cuda pipeline

remove comments

* try adding Rocm tests as well

* try with tests in place

* fix trailing ws

* add training data

* try again as root for tests

* use python3

* typo

* try to map video, render group into container

* try again

* try again

* try to avoid yum error code

* make UID 1001

* try without yum downgrade

* define rocm_version=None

* remove CUDA related comments for Rocm Dockerfile

* Dont pin nightly torch torchvision torchtext versions as they expire (for now nightly is required for Rocm 4.1)

* missed requirements-rocm.txt from last commit

* fix whitespace
2021-04-23 17:22:31 -07:00

154 lines
5.2 KiB
Bash
Executable file

#!/bin/bash
set -e -x
SCRIPT_DIR="$( dirname "${BASH_SOURCE[0]}" )"
INSTALL_DEPS_TRAINING=false
INSTALL_DEPS_DISTRIBUTED_SETUP=false
ORTMODULE_BUILD=false
TARGET_ROCM=false
while getopts p:d:tmur parameter_Option
do case "${parameter_Option}"
in
p) PYTHON_VER=${OPTARG};;
d) DEVICE_TYPE=${OPTARG};;
t) INSTALL_DEPS_TRAINING=true;;
m) INSTALL_DEPS_DISTRIBUTED_SETUP=true;;
u) ORTMODULE_BUILD=true;;
r) TARGET_ROCM=true;;
esac
done
echo "Python version=$PYTHON_VER"
DEVICE_TYPE=${DEVICE_TYPE:=Normal}
#Download a file from internet
function GetFile {
local uri=$1
local path=$2
local force=${3:-false}
local download_retries=${4:-5}
local retry_wait_time_seconds=${5:-30}
if [[ -f $path ]]; then
if [[ $force = false ]]; then
echo "File '$path' already exists. Skipping download"
return 0
else
rm -rf $path
fi
fi
if [[ -f $uri ]]; then
echo "'$uri' is a file path, copying file to '$path'"
cp $uri $path
return $?
fi
echo "Downloading $uri"
# Use aria2c if available, otherwise use curl
if command -v aria2c > /dev/null; then
aria2c -q -d $(dirname $path) -o $(basename $path) "$uri"
else
curl "$uri" -sSL --retry $download_retries --retry-delay $retry_wait_time_seconds --create-dirs -o "$path" --fail
fi
return $?
}
if [[ "$PYTHON_VER" = "3.5" && -d "/opt/python/cp35-cp35m" ]]; then
PYTHON_EXE="/opt/python/cp35-cp35m/bin/python3.5"
elif [[ "$PYTHON_VER" = "3.6" && -d "/opt/python/cp36-cp36m" ]]; then
PYTHON_EXE="/opt/python/cp36-cp36m/bin/python3.6"
elif [[ "$PYTHON_VER" = "3.7" && -d "/opt/python/cp37-cp37m" ]]; then
PYTHON_EXE="/opt/python/cp37-cp37m/bin/python3.7"
elif [[ "$PYTHON_VER" = "3.8" && -d "/opt/python/cp38-cp38" ]]; then
PYTHON_EXE="/opt/python/cp38-cp38/bin/python3.8"
elif [[ "$PYTHON_VER" = "3.9" && -d "/opt/python/cp39-cp39" ]]; then
PYTHON_EXE="/opt/python/cp39-cp39/bin/python3.9"
else
PYTHON_EXE="/usr/bin/python${PYTHON_VER}"
fi
SYS_LONG_BIT=$(getconf LONG_BIT)
mkdir -p /tmp/src
GLIBC_VERSION=$(getconf GNU_LIBC_VERSION | cut -f 2 -d \.)
DISTRIBUTOR=$(lsb_release -i -s)
if [[ "$DISTRIBUTOR" = "CentOS" && $SYS_LONG_BIT = "64" ]]; then
LIBDIR="lib64"
else
LIBDIR="lib"
fi
if [[ $SYS_LONG_BIT = "64" && "$GLIBC_VERSION" -gt "9" ]]; then
echo "Installing azcopy"
mkdir -p /tmp/azcopy
GetFile https://aka.ms/downloadazcopy-v10-linux /tmp/azcopy/azcopy.tar.gz
tar --strip 1 -xf /tmp/azcopy/azcopy.tar.gz -C /tmp/azcopy
cp /tmp/azcopy/azcopy /usr/bin
echo "Installing cmake"
GetFile https://github.com/Kitware/CMake/releases/download/v3.18.2/cmake-3.18.2-Linux-x86_64.tar.gz /tmp/src/cmake-3.18.2-Linux-x86_64.tar.gz
tar -zxf /tmp/src/cmake-3.18.2-Linux-x86_64.tar.gz --strip=1 -C /usr
echo "Installing Node.js"
GetFile https://nodejs.org/dist/v12.16.3/node-v12.16.3-linux-x64.tar.xz /tmp/src/node-v12.16.3-linux-x64.tar.xz
tar -xf /tmp/src/node-v12.16.3-linux-x64.tar.xz --strip=1 -C /usr
else
echo "Installing cmake"
GetFile https://github.com/Kitware/CMake/releases/download/v3.18.2/cmake-3.18.2.tar.gz /tmp/src/cmake-3.18.2.tar.gz
tar -xf /tmp/src/cmake-3.18.2.tar.gz -C /tmp/src
pushd .
cd /tmp/src/cmake-3.18.2
./bootstrap --prefix=/usr --parallel=$(getconf _NPROCESSORS_ONLN) --system-bzip2 --system-curl --system-zlib --system-expat
make -j$(getconf _NPROCESSORS_ONLN)
make install
popd
fi
GetFile https://downloads.gradle-dn.com/distributions/gradle-6.3-bin.zip /tmp/src/gradle-6.3-bin.zip
cd /tmp/src
unzip gradle-6.3-bin.zip
mv /tmp/src/gradle-6.3 /usr/local/gradle
if ! [ -x "$(command -v protoc)" ]; then
source ${0/%install_deps\.sh/install_protobuf\.sh}
fi
export ONNX_ML=1
export CMAKE_ARGS="-DONNX_GEN_PB_TYPE_STUBS=OFF -DONNX_WERROR=OFF"
${PYTHON_EXE} -m pip install -r ${0/%install_deps\.sh/requirements\.txt}
if [ $DEVICE_TYPE = "gpu" ]; then
if [[ $INSTALL_DEPS_TRAINING = true ]]; then
if [[ $ORTMODULE_BUILD = false ]]; then
${PYTHON_EXE} -m pip install -r ${0/%install_deps.sh/training\/requirements.txt}
else
if [[ $TARGET_ROCM = false ]]; then
${PYTHON_EXE} -m pip install -r ${0/%install_deps.sh/training\/ortmodule\/stage1\/requirements.txt}
# Due to a [bug on DeepSpeed](https://github.com/microsoft/DeepSpeed/issues/663), we install it separately through ortmodule/stage2/requirements.txt
${PYTHON_EXE} -m pip install -r ${0/%install_deps.sh/training\/ortmodule\/stage2\/requirements.txt}
else
${PYTHON_EXE} -m pip install \
--pre -f https://download.pytorch.org/whl/nightly/rocm4.1/torch_nightly.html \
torch torchvision torchtext
${PYTHON_EXE} -m pip install -r ${0/%install_deps.sh/training\/ortmodule\/stage1\/requirements-rocm.txt}
${PYTHON_EXE} -m pip install fairscale
# remove triton requirement from getting triggered in requirements-sparse_attn.txt
git clone https://github.com/ROCmSoftwarePlatform/DeepSpeed
cd DeepSpeed &&\
rm requirements/requirements-sparse_attn.txt &&\
${PYTHON_EXE} setup.py bdist_wheel &&\
${PYTHON_EXE} -m pip install dist/deepspeed*.whl &&\
cd ..
fi
fi
fi
if [[ $INSTALL_DEPS_DISTRIBUTED_SETUP = true ]]; then
source ${0/%install_deps.sh/install_openmpi.sh}
fi
fi
cd /
rm -rf /tmp/src
rm -rf /usr/include/google
rm -rf /usr/$LIBDIR/libproto*