Add CI pipeline to publish Python training package targeting Rocm (#7417)

* first attempt rocm training wheel

* modifications needed to python packaging pipeline for Rocm 4.1

* changes to not conflict with cuda

missed stage1 changes

remove package push

add option r to getopt

try again without python install

try again without python install

try again without python install

split pipelines and add back push to remote storage

try on cuda gpu pool

try again

try again

try running without az subscription set

try again on original pipeline

change pool

passing AMD Rocm whl on AMD-GPU pool

split rocm pipeline from cuda pipeline

remove comments

* try adding Rocm tests as well

* try with tests in place

* fix trailing ws

* add training data

* try again as root for tests

* use python3

* typo

* try to map video, render group into container

* try again

* try again

* try to avoid yum error code

* make UID 1001

* try without yum downgrade

* define rocm_version=None

* remove CUDA related comments for Rocm Dockerfile

* Don't pin nightly torch torchvision torchtext versions as they expire (for now nightly is required for Rocm 4.1)

* missed requirements-rocm.txt from last commit

* fix whitespace
This commit is contained in:
Suffian Khan 2021-04-23 17:22:31 -07:00 committed by GitHub
parent 34ebf7d3dd
commit 7a3c1787af
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 359 additions and 7 deletions

View file

@ -47,12 +47,16 @@ if parse_arg_remove_boolean(sys.argv, '--nightly_build'):
wheel_name_suffix = parse_arg_remove_string(sys.argv, '--wheel_name_suffix=')
cuda_version = None
rocm_version = None
# The following arguments are mutually exclusive
if parse_arg_remove_boolean(sys.argv, '--use_tensorrt'):
package_name = 'onnxruntime-gpu-tensorrt' if not nightly_build else 'ort-trt-nightly'
elif parse_arg_remove_boolean(sys.argv, '--use_cuda'):
package_name = 'onnxruntime-gpu' if not nightly_build else 'ort-gpu-nightly'
cuda_version = parse_arg_remove_string(sys.argv, '--cuda_version=')
elif parse_arg_remove_boolean(sys.argv, '--use_rocm'):
package_name = 'onnxruntime-rocm' if not nightly_build else 'ort-rocm-nightly'
rocm_version = parse_arg_remove_string(sys.argv, '--rocm_version=')
elif parse_arg_remove_boolean(sys.argv, '--use_openvino'):
package_name = 'onnxruntime-openvino'
elif parse_arg_remove_boolean(sys.argv, '--use_dnnl'):
@ -131,6 +135,7 @@ try:
copyfile(source, dest)
result = subprocess.run(['patchelf', '--print-needed', dest], check=True, stdout=subprocess.PIPE, universal_newlines=True)
cuda_dependencies = ['libcublas.so', 'libcudnn.so', 'libcudart.so', 'libcurand.so', 'libcufft.so', 'libnvToolsExt.so']
cuda_dependencies.extend(['librccl.so', 'libamdhip64.so', 'librocblas.so', 'libMIOpen.so', 'libhsa-runtime64.so', 'libhsakmt.so'])
to_preload = []
args = ['patchelf', '--debug']
for line in result.stdout.split('\n'):
@ -255,6 +260,11 @@ if enable_training:
# removing '.' to make Cuda version number in the same form as Pytorch.
cuda_version = cuda_version.replace('.', '')
local_version = '+cu' + cuda_version
if rocm_version:
# removing '.' to make Rocm version number in the same form as Pytorch.
rocm_version = rocm_version.replace('.', '')
local_version = '+rocm' + rocm_version
package_data = {}
data_files = []

View file

@ -499,6 +499,8 @@ def parse_arguments():
parser.add_argument("--disable_ort_format_load", action='store_true',
help='Disable support for loading ORT format models in a non-minimal build.')
parser.add_argument(
"--rocm_version", help="The version of ROCM stack to use. ")
parser.add_argument("--use_rocm", action='store_true', help="Build with ROCm")
parser.add_argument("--rocm_home", help="Path to ROCm installation dir")
@ -642,6 +644,7 @@ def generate_build_tree(cmake_path, source_dir, build_dir, cuda_home, cudnn_home
"-DPYTHON_EXECUTABLE=" + sys.executable,
"-Donnxruntime_USE_CUDA=" + ("ON" if args.use_cuda else "OFF"),
"-Donnxruntime_CUDA_VERSION=" + (args.cuda_version if args.use_cuda else ""),
"-Donnxruntime_ROCM_VERSION=" + (args.rocm_version if args.use_rocm else ""),
"-Donnxruntime_CUDA_HOME=" + (cuda_home if args.use_cuda else ""),
"-Donnxruntime_CUDNN_HOME=" + (cudnn_home if args.use_cuda else ""),
"-Donnxruntime_USE_FEATURIZERS=" + ("ON" if args.use_featurizers else "OFF"),
@ -1489,7 +1492,7 @@ def run_nodejs_tests(nodejs_binding_dir):
def build_python_wheel(
source_dir, build_dir, configs, use_cuda, cuda_version, use_dnnl,
source_dir, build_dir, configs, use_cuda, cuda_version, use_rocm, rocm_version, use_dnnl,
use_tensorrt, use_openvino, use_nuphar, use_vitisai, use_acl, use_armnn, use_dml,
wheel_name_suffix, enable_training, nightly_build=False, featurizers_build=False, use_ninja=False):
for config in configs:
@ -1527,6 +1530,10 @@ def build_python_wheel(
args.append('--use_cuda')
if cuda_version:
args.append('--cuda_version={}'.format(cuda_version))
elif use_rocm:
args.append('--use_rocm')
if rocm_version:
args.append('--rocm_version={}'.format(rocm_version))
elif use_openvino:
args.append('--use_openvino')
elif use_dnnl:
@ -1999,6 +2006,8 @@ def main():
raise BuildError("cuda_version must be specified on Windows.")
else:
args.cuda_version = ""
if args.use_rocm and args.rocm_version is None:
args.rocm_version = ""
generate_build_tree(
cmake_path, source_dir, build_dir, cuda_home, cudnn_home, rocm_home, mpi_home, nccl_home,
tensorrt_home, migraphx_home, acl_home, acl_libs, armnn_home, armnn_libs,
@ -2037,6 +2046,8 @@ def main():
configs,
args.use_cuda,
args.cuda_version,
args.use_rocm,
args.rocm_version,
args.use_dnnl,
args.use_tensorrt,
args.use_openvino,

View file

@ -0,0 +1,14 @@
# Pipeline that builds only the Linux ROCm training Python package.
# It delegates all work to the shared Python packaging stage template and
# switches every other platform/package variant off.
# No CI trigger is configured for this pipeline.
trigger: none
stages:
- template: templates/py-packaging-stage.yml
parameters:
# Extra build.py flags; the template appends them to the build command line.
build_py_parameters: --enable_training
enable_linux_cpu: false
enable_linux_gpu: false
# The CUDA training wheel is built by a separate pipeline, so it stays off here.
enable_linux_gpu_training: false
# The only variant enabled by this pipeline.
enable_linux_rocm_training: true
enable_windows_cpu: false
enable_windows_gpu: false
enable_mac_cpu: false
enable_linux_arm: false

View file

@ -7,6 +7,7 @@ stages:
enable_linux_cpu: false
enable_linux_gpu: false
enable_linux_gpu_training: true
enable_linux_rocm_training: false
enable_windows_cpu: false
enable_windows_gpu: false
enable_mac_cpu: false

View file

@ -20,6 +20,11 @@ parameters:
type: boolean
default: false
- name: enable_linux_rocm_training
displayName: 'Whether Linux ROCM package is built.'
type: boolean
default: false
- name: enable_windows_cpu
displayName: 'Whether Windows CPU package is built.'
type: boolean
@ -217,6 +222,256 @@ stages:
- template: clean-agent-build-directory-step.yml
- ${{ if eq(parameters.enable_linux_rocm_training, true) }}:
- job: Linux_py_ROCM_Wheels
timeoutInMinutes: 180
workspace:
clean: all
pool: AMD-GPU
# pool: Onnxruntime-Linux-GPU
strategy:
matrix:
Python36:
PythonVersion: '3.6'
Python37:
PythonVersion: '3.7'
Python38:
PythonVersion: '3.8'
# dependency PyTorch does not support Python 3.9 yet
# Python39:
# PythonVersion: '3.9'
steps:
- checkout: self
clean: true
submodules: recursive
- template: set-python-manylinux-variables-step.yml
- template: get-docker-image-steps.yml
parameters:
Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_rocm
Context: tools/ci_build/github/linux/docker
DockerBuildArgs: >-
--build-arg PYTHON_VERSION=$(PythonVersion)
--build-arg INSTALL_DEPS_EXTRA_ARGS=-tmur
--build-arg BUILD_UID=$(id -u)
Repository: onnxruntimetrainingrocmbuild
- task: CmdLine@2
inputs:
script: |
docker run --rm \
--privileged \
--ipc=host \
--network=host \
--cap-add=SYS_PTRACE \
--security-opt seccomp=unconfined \
-e CC=/opt/rh/devtoolset-8/root/usr/bin/cc -e CXX=/opt/rh/devtoolset-8/root/usr/bin/c++ -e CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" \
--volume $(Build.SourcesDirectory):/onnxruntime_src \
--volume $(Build.BinariesDirectory):/build \
--workdir /onnxruntime_src \
--entrypoint $(PythonManylinuxDir)/bin/python3 \
-e NVIDIA_VISIBLE_DEVICES=all \
-e NIGHTLY_BUILD \
-e BUILD_BUILDNUMBER \
--user onnxruntimedev \
onnxruntimetrainingrocmbuild \
/onnxruntime_src/tools/ci_build/build.py \
--config Release \
--use_rocm \
--rocm_version=4.1 \
--rocm_home=/opt/rocm \
--nccl_home=/opt/rocm \
--update \
--parallel \
--build_dir /build \
--build \
--build_wheel \
--skip_tests \
${{ parameters.build_py_parameters }} \
--cmake_extra_defines PYTHON_INCLUDE_DIR=$(PythonManylinuxIncludeDir) PYTHON_LIBRARY=/usr/lib64/librt.so \
workingDirectory: $(Build.SourcesDirectory)
displayName: 'Build onnxruntime (in container)'
- script: |-
python3 orttraining/tools/ci_test/download_azure_blob_archive.py \
--azure_blob_url https://onnxruntimetestdata.blob.core.windows.net/training/onnxruntime_training_data.zip?snapshot=2020-06-15T23:17:35.8314853Z \
--target_dir $(Build.SourcesDirectory)/training_e2e_test_data \
--archive_sha256_digest B01C169B6550D1A0A6F1B4E2F34AE2A8714B52DBB70AC04DA85D371F691BDFF9
displayName: 'Download onnxruntime_training_data.zip data'
- script: |-
echo "Tests will run using HIP_VISIBLES_DEVICES=$HIP_VISIBLE_DEVICES"
video_gid=$(getent group | awk '/video/ {split($0,a,":"); print(a[3])}')
echo "##vso[task.setvariable variable=video]$video_gid"
render_gid=$(getent group | awk '/render/ {split($0,a,":"); print(a[3])}')
echo "##vso[task.setvariable variable=render]$render_gid"
displayName: 'Find video and render gid to be mapped into container'
- script: |-
echo "video=$video"
echo "render=$render"
docker run --rm \
--device=/dev/kfd \
--device=/dev/dri \
--group-add $(video) \
--group-add $(render) \
--privileged \
--ipc=host \
--network=host \
--cap-add=SYS_PTRACE \
--security-opt seccomp=unconfined \
--volume $(Build.SourcesDirectory):/onnxruntime_src \
--volume $(Build.BinariesDirectory):/build \
--workdir /build/Release \
--entrypoint /bin/bash \
-e HIP_VISIBLE_DEVICES \
-e NIGHTLY_BUILD \
-e BUILD_BUILDNUMBER \
--user onnxruntimedev \
onnxruntimetrainingrocmbuild \
/onnxruntime_src/tools/ci_build/github/pai/pai_test_launcher.sh
displayName: 'Run onnxruntime unit tests (in container)'
- script: |-
docker run --rm \
--device=/dev/kfd \
--device=/dev/dri \
--group-add $(video) \
--group-add $(render) \
--privileged \
--ipc=host \
--network=host \
--cap-add=SYS_PTRACE \
--security-opt seccomp=unconfined \
--volume $(Build.SourcesDirectory):/onnxruntime_src \
--volume $(Build.BinariesDirectory):/build \
--workdir /onnxruntime_src \
--entrypoint $(PythonManylinuxDir)/bin/python3 \
-e HIP_VISIBLE_DEVICES \
-e NIGHTLY_BUILD \
-e BUILD_BUILDNUMBER \
--user onnxruntimedev \
onnxruntimetrainingrocmbuild \
orttraining/tools/ci_test/run_batch_size_test.py \
--binary_dir /build/Release \
--model_root training_e2e_test_data/models \
--gpu_sku MI100_32G
displayName: 'Run C++ BERT-L batch size test (in container)'
condition: succeededOrFailed() # ensure all tests are run
- script: |-
docker run --rm \
--device=/dev/kfd \
--device=/dev/dri \
--group-add $(video) \
--group-add $(render) \
--privileged \
--ipc=host \
--network=host \
--cap-add=SYS_PTRACE \
--security-opt seccomp=unconfined \
--volume $(Build.SourcesDirectory):/onnxruntime_src \
--volume $(Build.BinariesDirectory):/build \
--workdir /onnxruntime_src \
--entrypoint $(PythonManylinuxDir)/bin/python3 \
-e HIP_VISIBLE_DEVICES \
-e NIGHTLY_BUILD \
-e BUILD_BUILDNUMBER \
--user onnxruntimedev \
onnxruntimetrainingrocmbuild \
orttraining/tools/ci_test/run_bert_perf_test.py \
--binary_dir /build/Release \
--model_root training_e2e_test_data/models \
--training_data_root training_e2e_test_data/data \
--gpu_sku MI100_32G
displayName: 'Run C++ BERT-L performance test (in container)'
condition: succeededOrFailed() # ensure all tests are run
- script: |-
docker run --rm \
--device=/dev/kfd \
--device=/dev/dri \
--group-add $(video) \
--group-add $(render) \
--privileged \
--ipc=host \
--network=host \
--cap-add=SYS_PTRACE \
--security-opt seccomp=unconfined \
--volume $(Build.SourcesDirectory):/onnxruntime_src \
--volume $(Build.BinariesDirectory):/build \
--workdir /onnxruntime_src \
--entrypoint $(PythonManylinuxDir)/bin/python3 \
-e HIP_VISIBLE_DEVICES \
-e NIGHTLY_BUILD \
-e BUILD_BUILDNUMBER \
--user onnxruntimedev \
onnxruntimetrainingrocmbuild \
orttraining/tools/ci_test/run_convergence_test.py \
--binary_dir /build/Release \
--model_root training_e2e_test_data/models \
--training_data_root training_e2e_test_data/data \
--gpu_sku MI100_32G
displayName: 'Run C++ BERT-L convergence test (in container)'
condition: succeededOrFailed() # ensure all tests are run
- task: CopyFiles@2
displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)'
inputs:
SourceFolder: '$(Build.BinariesDirectory)'
Contents: 'Release/dist/*.whl'
TargetFolder: '$(Build.ArtifactStagingDirectory)'
- task: CmdLine@2
displayName: 'Build Python Documentation'
condition: ne(variables['PythonVersion'], '3.9') # tensorflow not available on python 3.9
inputs:
script: |
mkdir -p $HOME/.onnx
docker run --rm \
--volume $(Build.SourcesDirectory):/onnxruntime_src \
--volume $(Build.BinariesDirectory):/build \
-e NIGHTLY_BUILD \
-e BUILD_BUILDNUMBER \
--entrypoint /bin/bash \
onnxruntimetrainingrocmbuild \
/onnxruntime_src/tools/doc/builddoc.sh $(PythonManylinuxDir)/bin/ /onnxruntime_src /build Release
workingDirectory: $(Build.SourcesDirectory)
- task: CopyFiles@2
displayName: 'Copy Python Documentation to: $(Build.ArtifactStagingDirectory)'
condition: ne(variables['PythonVersion'], '3.9') # tensorflow not available on python 3.9
inputs:
SourceFolder: '$(Build.BinariesDirectory)/docs/training/html'
Contents: '**'
TargetFolder: '$(Build.ArtifactStagingDirectory)/training_html_doc'
- task: PublishBuildArtifacts@1
displayName: 'Upload Rocm wheel as build artifact'
inputs:
ArtifactName: onnxruntime_rocm
- script: |
python3 -m pip install azure-storage-blob==2.1.0
files=($(Build.ArtifactStagingDirectory)/Release/dist/*.whl) && \
echo ${files[0]} && \
python3 tools/ci_build/upload_python_package_to_azure_storage.py \
--python_wheel_path ${files[0]} \
--account_name onnxruntimepackages \
--account_key $(orttrainingpackagestorageaccountkey) \
--container_name '$web'
condition: and(succeeded(), eq(variables['DRY_RUN'], '0'))
displayName: 'Upload Rocm wheel to release repository'
- template: component-governance-component-detection-steps.yml
parameters:
condition: 'succeeded'
- template: clean-agent-build-directory-step.yml
- ${{ if eq(parameters.enable_linux_gpu_training, true) }}:
- job: Linux_py_GPU_Wheels
timeoutInMinutes: 180

View file

@ -0,0 +1,36 @@
# Build image for the ROCm training Python wheel.
# Layers, in order: ROCm PyTorch base image -> manylinux2014 toolchain setup ->
# project dependencies -> unprivileged build user.
FROM rocm/pytorch:rocm4.1.1_centos7_py3.6_pytorch
#Build manylinux2014 docker image begin
# Environment read by the manylinux2014 build scripts copied in below
# (build.sh branches on AUDITWHEEL_ARCH).
ENV AUDITWHEEL_ARCH x86_64
ENV AUDITWHEEL_PLAT manylinux2014_$AUDITWHEEL_ARCH
ENV LC_ALL en_US.UTF-8
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US.UTF-8
# Put the devtoolset-8 toolchain first on PATH and on the library search path.
ENV DEVTOOLSET_ROOTPATH /opt/rh/devtoolset-8/root
ENV PATH $DEVTOOLSET_ROOTPATH/usr/bin:$PATH
ENV LD_LIBRARY_PATH $DEVTOOLSET_ROOTPATH/usr/lib64:$DEVTOOLSET_ROOTPATH/usr/lib:$DEVTOOLSET_ROOTPATH/usr/lib64/dyninst:$DEVTOOLSET_ROOTPATH/usr/lib/dyninst:/usr/local/lib64:/usr/local/lib
ENV PKG_CONFIG_PATH /usr/local/lib/pkgconfig
COPY manylinux2014_build_scripts /manylinux2014_build_scripts
# The argument "8" is presumably the devtoolset version (it matches
# DEVTOOLSET_ROOTPATH above) - TODO confirm against build.sh.
# The scripts are removed in the same layer once they have run.
RUN bash /manylinux2014_build_scripts/build.sh 8 && rm -r /manylinux2014_build_scripts
ENV SSL_CERT_FILE=/opt/_internal/certs.pem
#Build manylinux2014 docker image end
# Build-time knobs: the Python version to install dependencies for, and extra
# flags appended verbatim to the install_deps.sh command line.
ARG PYTHON_VERSION=3.6
ARG INSTALL_DEPS_EXTRA_ARGS
#Add our own dependencies
ADD scripts /tmp/scripts
# Install OS packages, then the Python/training dependencies, and delete the
# scripts afterwards within the same layer.
RUN cd /tmp/scripts && \
/tmp/scripts/install_centos.sh && \
/tmp/scripts/install_deps.sh -d gpu -p $PYTHON_VERSION $INSTALL_DEPS_EXTRA_ARGS && \
rm -rf /tmp/scripts
# Create the build user; the UID can be overridden at build time
# (the CI pipeline passes the agent's own `id -u`).
ARG BUILD_UID=1001
ARG BUILD_USER=onnxruntimedev
RUN adduser --uid $BUILD_UID $BUILD_USER
WORKDIR /home/$BUILD_USER
# Everything from here on, including containers started from this image,
# runs as the build user by default.
USER $BUILD_USER
ENV PATH /usr/local/gradle/bin:/usr/local/dotnet:$PATH

View file

@ -66,8 +66,8 @@ TOOLCHAIN_DEPS="devtoolset-$1-binutils devtoolset-$1-gcc devtoolset-$1-gcc-c++ d
if [ "${AUDITWHEEL_ARCH}" == "x86_64" ]; then
# Software collection (for devtoolset-$1)
yum -y install centos-release-scl-rh
# EPEL support (for yasm)
yum -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
# EPEL support (for yasm) (localinstall to avoid error code if already installed - as for Rocm container)
yum -y localinstall https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
YASM=yasm
elif [ "${AUDITWHEEL_ARCH}" == "aarch64" ] || [ "${AUDITWHEEL_ARCH}" == "ppc64le" ] || [ "${AUDITWHEEL_ARCH}" == "s390x" ]; then
# Software collection (for devtoolset-$1)

View file

@ -5,8 +5,9 @@ SCRIPT_DIR="$( dirname "${BASH_SOURCE[0]}" )"
INSTALL_DEPS_TRAINING=false
INSTALL_DEPS_DISTRIBUTED_SETUP=false
ORTMODULE_BUILD=false
TARGET_ROCM=false
while getopts p:d:tmu parameter_Option
while getopts p:d:tmur parameter_Option
do case "${parameter_Option}"
in
p) PYTHON_VER=${OPTARG};;
@ -14,6 +15,7 @@ d) DEVICE_TYPE=${OPTARG};;
t) INSTALL_DEPS_TRAINING=true;;
m) INSTALL_DEPS_DISTRIBUTED_SETUP=true;;
u) ORTMODULE_BUILD=true;;
r) TARGET_ROCM=true;;
esac
done
@ -121,9 +123,24 @@ if [ $DEVICE_TYPE = "gpu" ]; then
if [[ $ORTMODULE_BUILD = false ]]; then
${PYTHON_EXE} -m pip install -r ${0/%install_deps.sh/training\/requirements.txt}
else
${PYTHON_EXE} -m pip install -r ${0/%install_deps.sh/training\/ortmodule\/stage1\/requirements.txt}
# Due to a [bug on DeepSpeed](https://github.com/microsoft/DeepSpeed/issues/663), we install it separately through ortmodule/stage2/requirements.txt
${PYTHON_EXE} -m pip install -r ${0/%install_deps.sh/training\/ortmodule\/stage2\/requirements.txt}
if [[ $TARGET_ROCM = false ]]; then
${PYTHON_EXE} -m pip install -r ${0/%install_deps.sh/training\/ortmodule\/stage1\/requirements.txt}
# Due to a [bug on DeepSpeed](https://github.com/microsoft/DeepSpeed/issues/663), we install it separately through ortmodule/stage2/requirements.txt
${PYTHON_EXE} -m pip install -r ${0/%install_deps.sh/training\/ortmodule\/stage2\/requirements.txt}
else
${PYTHON_EXE} -m pip install \
--pre -f https://download.pytorch.org/whl/nightly/rocm4.1/torch_nightly.html \
torch torchvision torchtext
${PYTHON_EXE} -m pip install -r ${0/%install_deps.sh/training\/ortmodule\/stage1\/requirements-rocm.txt}
${PYTHON_EXE} -m pip install fairscale
# remove triton requirement from getting triggered in requirements-sparse_attn.txt
git clone https://github.com/ROCmSoftwarePlatform/DeepSpeed
cd DeepSpeed &&\
rm requirements/requirements-sparse_attn.txt &&\
${PYTHON_EXE} setup.py bdist_wheel &&\
${PYTHON_EXE} -m pip install dist/deepspeed*.whl &&\
cd ..
fi
fi
fi
if [[ $INSTALL_DEPS_DISTRIBUTED_SETUP = true ]]; then

View file

@ -0,0 +1,8 @@
# Python packages installed with `pip install -r` for the ROCm training build.
# NOTE(review): presumably this is training/ortmodule/stage1/requirements-rocm.txt;
# torch/torchvision/torchtext are not listed here because install_deps.sh
# installs them separately from the ROCm nightly index - confirm.
# transformers requires sklearn
pandas
sklearn
numpy==1.19.5
transformers==v4.3.2
tensorboard
h5py
wget