From 7a3c1787af012916fbabd109ffdf389dcde071d4 Mon Sep 17 00:00:00 2001 From: Suffian Khan Date: Fri, 23 Apr 2021 17:22:31 -0700 Subject: [PATCH] Add CI pipeline to publish Python training package targeting Rocm (#7417) * first attempt rocm training wheel * modifications needed to python packaging pipeline for Rocm 4.1 * changes to not conflict with cuda missed stage1 changes remove package push add option r to getopt try again without python install try again without python install try again without python install split pipelines and add back push to remote storage try on cuda gpu pool try again try again try running without az subscription set try again on original pipeline change pool passing AMD Rocm whl on AMD-GPU pool split rocm pipeline from cuda pipeline remove comments * try adding Rocm tests as well * try with tests in place * fix trailing ws * add training data * try again as root for tests * use python3 * typo * try to map video, render group into container * try again * try again * try to avoid yum error code * make UID 1001 * try without yum downgrade * define rocm_version=None * remove CUDA related comments for Rocm Dockerfile * Don't pin nightly torch torchvision torchtext versions as they expire (for now nightly is required for Rocm 4.1) * missed requirements-rocm.txt from last commit * fix whitespace --- setup.py | 10 + tools/ci_build/build.py | 13 +- ...orttraining-py-packaging-pipeline-rocm.yml | 14 + .../orttraining-py-packaging-pipeline.yml | 1 + .../templates/py-packaging-stage.yml | 255 ++++++++++++++++++ .../docker/Dockerfile.manylinux2014_rocm | 36 +++ .../manylinux2014_build_scripts/build.sh | 4 +- .../linux/docker/scripts/install_deps.sh | 25 +- .../ortmodule/stage1/requirements-rocm.txt | 8 + 9 files changed, 359 insertions(+), 7 deletions(-) create mode 100644 tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-rocm.yml create mode 100644 tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_rocm create mode 
100644 tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements-rocm.txt diff --git a/setup.py b/setup.py index dec108f6e9..3126ca81dc 100644 --- a/setup.py +++ b/setup.py @@ -47,12 +47,16 @@ if parse_arg_remove_boolean(sys.argv, '--nightly_build'): wheel_name_suffix = parse_arg_remove_string(sys.argv, '--wheel_name_suffix=') cuda_version = None +rocm_version = None # The following arguments are mutually exclusive if parse_arg_remove_boolean(sys.argv, '--use_tensorrt'): package_name = 'onnxruntime-gpu-tensorrt' if not nightly_build else 'ort-trt-nightly' elif parse_arg_remove_boolean(sys.argv, '--use_cuda'): package_name = 'onnxruntime-gpu' if not nightly_build else 'ort-gpu-nightly' cuda_version = parse_arg_remove_string(sys.argv, '--cuda_version=') +elif parse_arg_remove_boolean(sys.argv, '--use_rocm'): + package_name = 'onnxruntime-rocm' if not nightly_build else 'ort-rocm-nightly' + rocm_version = parse_arg_remove_string(sys.argv, '--rocm_version=') elif parse_arg_remove_boolean(sys.argv, '--use_openvino'): package_name = 'onnxruntime-openvino' elif parse_arg_remove_boolean(sys.argv, '--use_dnnl'): @@ -131,6 +135,7 @@ try: copyfile(source, dest) result = subprocess.run(['patchelf', '--print-needed', dest], check=True, stdout=subprocess.PIPE, universal_newlines=True) cuda_dependencies = ['libcublas.so', 'libcudnn.so', 'libcudart.so', 'libcurand.so', 'libcufft.so', 'libnvToolsExt.so'] + cuda_dependencies.extend(['librccl.so', 'libamdhip64.so', 'librocblas.so', 'libMIOpen.so', 'libhsa-runtime64.so', 'libhsakmt.so']) to_preload = [] args = ['patchelf', '--debug'] for line in result.stdout.split('\n'): @@ -255,6 +260,11 @@ if enable_training: # removing '.' to make Cuda version number in the same form as Pytorch. cuda_version = cuda_version.replace('.', '') local_version = '+cu' + cuda_version + if rocm_version: + # removing '.' to make ROCm version number in the same form as Pytorch. 
+ rocm_version = rocm_version.replace('.', '') + local_version = '+rocm' + rocm_version + package_data = {} data_files = [] diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 746b2d3a01..d83a700e0f 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -499,6 +499,8 @@ def parse_arguments(): parser.add_argument("--disable_ort_format_load", action='store_true', help='Disable support for loading ORT format models in a non-minimal build.') + parser.add_argument( + "--rocm_version", help="The version of ROCM stack to use. ") parser.add_argument("--use_rocm", action='store_true', help="Build with ROCm") parser.add_argument("--rocm_home", help="Path to ROCm installation dir") @@ -642,6 +644,7 @@ def generate_build_tree(cmake_path, source_dir, build_dir, cuda_home, cudnn_home "-DPYTHON_EXECUTABLE=" + sys.executable, "-Donnxruntime_USE_CUDA=" + ("ON" if args.use_cuda else "OFF"), "-Donnxruntime_CUDA_VERSION=" + (args.cuda_version if args.use_cuda else ""), + "-Donnxruntime_ROCM_VERSION=" + (args.rocm_version if args.use_rocm else ""), "-Donnxruntime_CUDA_HOME=" + (cuda_home if args.use_cuda else ""), "-Donnxruntime_CUDNN_HOME=" + (cudnn_home if args.use_cuda else ""), "-Donnxruntime_USE_FEATURIZERS=" + ("ON" if args.use_featurizers else "OFF"), @@ -1489,7 +1492,7 @@ def run_nodejs_tests(nodejs_binding_dir): def build_python_wheel( - source_dir, build_dir, configs, use_cuda, cuda_version, use_dnnl, + source_dir, build_dir, configs, use_cuda, cuda_version, use_rocm, rocm_version, use_dnnl, use_tensorrt, use_openvino, use_nuphar, use_vitisai, use_acl, use_armnn, use_dml, wheel_name_suffix, enable_training, nightly_build=False, featurizers_build=False, use_ninja=False): for config in configs: @@ -1527,6 +1530,10 @@ def build_python_wheel( args.append('--use_cuda') if cuda_version: args.append('--cuda_version={}'.format(cuda_version)) + elif use_rocm: + args.append('--use_rocm') + if rocm_version: + 
args.append('--rocm_version={}'.format(rocm_version)) elif use_openvino: args.append('--use_openvino') elif use_dnnl: @@ -1999,6 +2006,8 @@ def main(): raise BuildError("cuda_version must be specified on Windows.") else: args.cuda_version = "" + if args.use_rocm and args.rocm_version is None: + args.rocm_version = "" generate_build_tree( cmake_path, source_dir, build_dir, cuda_home, cudnn_home, rocm_home, mpi_home, nccl_home, tensorrt_home, migraphx_home, acl_home, acl_libs, armnn_home, armnn_libs, @@ -2037,6 +2046,8 @@ def main(): configs, args.use_cuda, args.cuda_version, + args.use_rocm, + args.rocm_version, args.use_dnnl, args.use_tensorrt, args.use_openvino, diff --git a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-rocm.yml b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-rocm.yml new file mode 100644 index 0000000000..cbd4ac3354 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-rocm.yml @@ -0,0 +1,14 @@ +trigger: none + +stages: +- template: templates/py-packaging-stage.yml + parameters: + build_py_parameters: --enable_training + enable_linux_cpu: false + enable_linux_gpu: false + enable_linux_gpu_training: false + enable_linux_rocm_training: true + enable_windows_cpu: false + enable_windows_gpu: false + enable_mac_cpu: false + enable_linux_arm: false diff --git a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline.yml index c5568e5381..3b27c39a63 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline.yml @@ -7,6 +7,7 @@ stages: enable_linux_cpu: false enable_linux_gpu: false enable_linux_gpu_training: true + enable_linux_rocm_training: false enable_windows_cpu: false enable_windows_gpu: false enable_mac_cpu: false diff --git 
a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml index 21ea3fa532..f19b916a64 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml @@ -20,6 +20,11 @@ parameters: type: boolean default: false +- name: enable_linux_rocm_training + displayName: 'Whether Linux ROCM package is built.' + type: boolean + default: false + - name: enable_windows_cpu displayName: 'Whether Windows CPU package is built.' type: boolean @@ -217,6 +222,256 @@ stages: - template: clean-agent-build-directory-step.yml + - ${{ if eq(parameters.enable_linux_rocm_training, true) }}: + - job: Linux_py_ROCM_Wheels + timeoutInMinutes: 180 + workspace: + clean: all + pool: AMD-GPU + # pool: Onnxruntime-Linux-GPU + strategy: + matrix: + Python36: + PythonVersion: '3.6' + Python37: + PythonVersion: '3.7' + Python38: + PythonVersion: '3.8' + # dependency PyTorch does not support Python 3.9 yet + # Python39: + # PythonVersion: '3.9' + steps: + + - checkout: self + clean: true + submodules: recursive + + - template: set-python-manylinux-variables-step.yml + + - template: get-docker-image-steps.yml + parameters: + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_rocm + Context: tools/ci_build/github/linux/docker + DockerBuildArgs: >- + --build-arg PYTHON_VERSION=$(PythonVersion) + --build-arg INSTALL_DEPS_EXTRA_ARGS=-tmur + --build-arg BUILD_UID=$(id -u) + Repository: onnxruntimetrainingrocmbuild + + - task: CmdLine@2 + inputs: + script: | + docker run --rm \ + --privileged \ + --ipc=host \ + --network=host \ + --cap-add=SYS_PTRACE \ + --security-opt seccomp=unconfined \ + -e CC=/opt/rh/devtoolset-8/root/usr/bin/cc -e CXX=/opt/rh/devtoolset-8/root/usr/bin/c++ -e CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 
-Wl,--strip-all" -e CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" \ + --volume $(Build.SourcesDirectory):/onnxruntime_src \ + --volume $(Build.BinariesDirectory):/build \ + --workdir /onnxruntime_src \ + --entrypoint $(PythonManylinuxDir)/bin/python3 \ + -e NVIDIA_VISIBLE_DEVICES=all \ + -e NIGHTLY_BUILD \ + -e BUILD_BUILDNUMBER \ + --user onnxruntimedev \ + onnxruntimetrainingrocmbuild \ + /onnxruntime_src/tools/ci_build/build.py \ + --config Release \ + --use_rocm \ + --rocm_version=4.1 \ + --rocm_home=/opt/rocm \ + --nccl_home=/opt/rocm \ + --update \ + --parallel \ + --build_dir /build \ + --build \ + --build_wheel \ + --skip_tests \ + ${{ parameters.build_py_parameters }} \ + --cmake_extra_defines PYTHON_INCLUDE_DIR=$(PythonManylinuxIncludeDir) PYTHON_LIBRARY=/usr/lib64/librt.so \ + workingDirectory: $(Build.SourcesDirectory) + displayName: 'Build onnxruntime (in container)' + + - script: |- + python3 orttraining/tools/ci_test/download_azure_blob_archive.py \ + --azure_blob_url https://onnxruntimetestdata.blob.core.windows.net/training/onnxruntime_training_data.zip?snapshot=2020-06-15T23:17:35.8314853Z \ + --target_dir $(Build.SourcesDirectory)/training_e2e_test_data \ + --archive_sha256_digest B01C169B6550D1A0A6F1B4E2F34AE2A8714B52DBB70AC04DA85D371F691BDFF9 + displayName: 'Download onnxruntime_training_data.zip data' + + - script: |- + echo "Tests will run using HIP_VISIBLES_DEVICES=$HIP_VISIBLE_DEVICES" + video_gid=$(getent group | awk '/video/ {split($0,a,":"); print(a[3])}') + echo "##vso[task.setvariable variable=video]$video_gid" + render_gid=$(getent group | awk '/render/ {split($0,a,":"); print(a[3])}') + echo "##vso[task.setvariable variable=render]$render_gid" + displayName: 'Find video and render gid to be mapped into container' + + - script: |- + echo "video=$video" + echo "render=$render" + docker run --rm \ + --device=/dev/kfd \ + 
--device=/dev/dri \ + --group-add $(video) \ + --group-add $(render) \ + --privileged \ + --ipc=host \ + --network=host \ + --cap-add=SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --volume $(Build.SourcesDirectory):/onnxruntime_src \ + --volume $(Build.BinariesDirectory):/build \ + --workdir /build/Release \ + --entrypoint /bin/bash \ + -e HIP_VISIBLE_DEVICES \ + -e NIGHTLY_BUILD \ + -e BUILD_BUILDNUMBER \ + --user onnxruntimedev \ + onnxruntimetrainingrocmbuild \ + /onnxruntime_src/tools/ci_build/github/pai/pai_test_launcher.sh + displayName: 'Run onnxruntime unit tests (in container)' + + - script: |- + docker run --rm \ + --device=/dev/kfd \ + --device=/dev/dri \ + --group-add $(video) \ + --group-add $(render) \ + --privileged \ + --ipc=host \ + --network=host \ + --cap-add=SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --volume $(Build.SourcesDirectory):/onnxruntime_src \ + --volume $(Build.BinariesDirectory):/build \ + --workdir /onnxruntime_src \ + --entrypoint $(PythonManylinuxDir)/bin/python3 \ + -e HIP_VISIBLE_DEVICES \ + -e NIGHTLY_BUILD \ + -e BUILD_BUILDNUMBER \ + --user onnxruntimedev \ + onnxruntimetrainingrocmbuild \ + orttraining/tools/ci_test/run_batch_size_test.py \ + --binary_dir /build/Release \ + --model_root training_e2e_test_data/models \ + --gpu_sku MI100_32G + displayName: 'Run C++ BERT-L batch size test (in container)' + condition: succeededOrFailed() # ensure all tests are run + + - script: |- + docker run --rm \ + --device=/dev/kfd \ + --device=/dev/dri \ + --group-add $(video) \ + --group-add $(render) \ + --privileged \ + --ipc=host \ + --network=host \ + --cap-add=SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --volume $(Build.SourcesDirectory):/onnxruntime_src \ + --volume $(Build.BinariesDirectory):/build \ + --workdir /onnxruntime_src \ + --entrypoint $(PythonManylinuxDir)/bin/python3 \ + -e HIP_VISIBLE_DEVICES \ + -e NIGHTLY_BUILD \ + -e BUILD_BUILDNUMBER \ + --user onnxruntimedev \ + 
onnxruntimetrainingrocmbuild \ + orttraining/tools/ci_test/run_bert_perf_test.py \ + --binary_dir /build/Release \ + --model_root training_e2e_test_data/models \ + --training_data_root training_e2e_test_data/data \ + --gpu_sku MI100_32G + displayName: 'Run C++ BERT-L performance test (in container)' + condition: succeededOrFailed() # ensure all tests are run + + - script: |- + docker run --rm \ + --device=/dev/kfd \ + --device=/dev/dri \ + --group-add $(video) \ + --group-add $(render) \ + --privileged \ + --ipc=host \ + --network=host \ + --cap-add=SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --volume $(Build.SourcesDirectory):/onnxruntime_src \ + --volume $(Build.BinariesDirectory):/build \ + --workdir /onnxruntime_src \ + --entrypoint $(PythonManylinuxDir)/bin/python3 \ + -e HIP_VISIBLE_DEVICES \ + -e NIGHTLY_BUILD \ + -e BUILD_BUILDNUMBER \ + --user onnxruntimedev \ + onnxruntimetrainingrocmbuild \ + orttraining/tools/ci_test/run_convergence_test.py \ + --binary_dir /build/Release \ + --model_root training_e2e_test_data/models \ + --training_data_root training_e2e_test_data/data \ + --gpu_sku MI100_32G + displayName: 'Run C++ BERT-L convergence test (in container)' + condition: succeededOrFailed() # ensure all tests are run + + - task: CopyFiles@2 + displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)' + inputs: + SourceFolder: '$(Build.BinariesDirectory)' + Contents: 'Release/dist/*.whl' + TargetFolder: '$(Build.ArtifactStagingDirectory)' + + - task: CmdLine@2 + displayName: 'Build Python Documentation' + condition: ne(variables['PythonVersion'], '3.9') # tensorflow not available on python 3.9 + inputs: + script: | + mkdir -p $HOME/.onnx + docker run --rm \ + --volume $(Build.SourcesDirectory):/onnxruntime_src \ + --volume $(Build.BinariesDirectory):/build \ + -e NIGHTLY_BUILD \ + -e BUILD_BUILDNUMBER \ + --entrypoint /bin/bash \ + onnxruntimetrainingrocmbuild \ + /onnxruntime_src/tools/doc/builddoc.sh $(PythonManylinuxDir)/bin/ 
/onnxruntime_src /build Release + workingDirectory: $(Build.SourcesDirectory) + + - task: CopyFiles@2 + displayName: 'Copy Python Documentation to: $(Build.ArtifactStagingDirectory)' + condition: ne(variables['PythonVersion'], '3.9') # tensorflow not available on python 3.9 + inputs: + SourceFolder: '$(Build.BinariesDirectory)/docs/training/html' + Contents: '**' + TargetFolder: '$(Build.ArtifactStagingDirectory)/training_html_doc' + + - task: PublishBuildArtifacts@1 + displayName: 'Upload Rocm wheel as build artifact' + inputs: + ArtifactName: onnxruntime_rocm + + - script: | + python3 -m pip install azure-storage-blob==2.1.0 + files=($(Build.ArtifactStagingDirectory)/Release/dist/*.whl) && \ + echo ${files[0]} && \ + python3 tools/ci_build/upload_python_package_to_azure_storage.py \ + --python_wheel_path ${files[0]} \ + --account_name onnxruntimepackages \ + --account_key $(orttrainingpackagestorageaccountkey) \ + --container_name '$web' + condition: and(succeeded(), eq(variables['DRY_RUN'], '0')) + displayName: 'Upload Rocm wheel to release repository' + + - template: component-governance-component-detection-steps.yml + parameters: + condition: 'succeeded' + + - template: clean-agent-build-directory-step.yml + + - ${{ if eq(parameters.enable_linux_gpu_training, true) }}: - job: Linux_py_GPU_Wheels timeoutInMinutes: 180 diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_rocm b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_rocm new file mode 100644 index 0000000000..889b5268fe --- /dev/null +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_rocm @@ -0,0 +1,36 @@ +FROM rocm/pytorch:rocm4.1.1_centos7_py3.6_pytorch + +#Build manylinux2014 docker image begin +ENV AUDITWHEEL_ARCH x86_64 +ENV AUDITWHEEL_PLAT manylinux2014_$AUDITWHEEL_ARCH +ENV LC_ALL en_US.UTF-8 +ENV LANG en_US.UTF-8 +ENV LANGUAGE en_US.UTF-8 +ENV DEVTOOLSET_ROOTPATH /opt/rh/devtoolset-8/root +ENV PATH $DEVTOOLSET_ROOTPATH/usr/bin:$PATH +ENV 
LD_LIBRARY_PATH $DEVTOOLSET_ROOTPATH/usr/lib64:$DEVTOOLSET_ROOTPATH/usr/lib:$DEVTOOLSET_ROOTPATH/usr/lib64/dyninst:$DEVTOOLSET_ROOTPATH/usr/lib/dyninst:/usr/local/lib64:/usr/local/lib +ENV PKG_CONFIG_PATH /usr/local/lib/pkgconfig + +COPY manylinux2014_build_scripts /manylinux2014_build_scripts +RUN bash /manylinux2014_build_scripts/build.sh 8 && rm -r /manylinux2014_build_scripts + +ENV SSL_CERT_FILE=/opt/_internal/certs.pem + +#Build manylinux2014 docker image end + +ARG PYTHON_VERSION=3.6 +ARG INSTALL_DEPS_EXTRA_ARGS + +#Add our own dependencies +ADD scripts /tmp/scripts +RUN cd /tmp/scripts && \ + /tmp/scripts/install_centos.sh && \ + /tmp/scripts/install_deps.sh -d gpu -p $PYTHON_VERSION $INSTALL_DEPS_EXTRA_ARGS && \ + rm -rf /tmp/scripts + +ARG BUILD_UID=1001 +ARG BUILD_USER=onnxruntimedev +RUN adduser --uid $BUILD_UID $BUILD_USER +WORKDIR /home/$BUILD_USER +USER $BUILD_USER +ENV PATH /usr/local/gradle/bin:/usr/local/dotnet:$PATH diff --git a/tools/ci_build/github/linux/docker/manylinux2014_build_scripts/build.sh b/tools/ci_build/github/linux/docker/manylinux2014_build_scripts/build.sh index 9ea2b7ace7..947651bcb9 100755 --- a/tools/ci_build/github/linux/docker/manylinux2014_build_scripts/build.sh +++ b/tools/ci_build/github/linux/docker/manylinux2014_build_scripts/build.sh @@ -66,8 +66,8 @@ TOOLCHAIN_DEPS="devtoolset-$1-binutils devtoolset-$1-gcc devtoolset-$1-gcc-c++ d if [ "${AUDITWHEEL_ARCH}" == "x86_64" ]; then # Software collection (for devtoolset-$1) yum -y install centos-release-scl-rh - # EPEL support (for yasm) - yum -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm + # EPEL support (for yasm) (localinstall to avoid error code if already installed - as for Rocm container) + yum -y localinstall https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm YASM=yasm elif [ "${AUDITWHEEL_ARCH}" == "aarch64" ] || [ "${AUDITWHEEL_ARCH}" == "ppc64le" ] || [ "${AUDITWHEEL_ARCH}" == "s390x" ]; then # Software 
collection (for devtoolset-$1) diff --git a/tools/ci_build/github/linux/docker/scripts/install_deps.sh b/tools/ci_build/github/linux/docker/scripts/install_deps.sh index 64a178402c..ebf6555b9e 100755 --- a/tools/ci_build/github/linux/docker/scripts/install_deps.sh +++ b/tools/ci_build/github/linux/docker/scripts/install_deps.sh @@ -5,8 +5,9 @@ SCRIPT_DIR="$( dirname "${BASH_SOURCE[0]}" )" INSTALL_DEPS_TRAINING=false INSTALL_DEPS_DISTRIBUTED_SETUP=false ORTMODULE_BUILD=false +TARGET_ROCM=false -while getopts p:d:tmu parameter_Option +while getopts p:d:tmur parameter_Option do case "${parameter_Option}" in p) PYTHON_VER=${OPTARG};; @@ -14,6 +15,7 @@ d) DEVICE_TYPE=${OPTARG};; t) INSTALL_DEPS_TRAINING=true;; m) INSTALL_DEPS_DISTRIBUTED_SETUP=true;; u) ORTMODULE_BUILD=true;; +r) TARGET_ROCM=true;; esac done @@ -121,9 +123,24 @@ if [ $DEVICE_TYPE = "gpu" ]; then if [[ $ORTMODULE_BUILD = false ]]; then ${PYTHON_EXE} -m pip install -r ${0/%install_deps.sh/training\/requirements.txt} else - ${PYTHON_EXE} -m pip install -r ${0/%install_deps.sh/training\/ortmodule\/stage1\/requirements.txt} - # Due to a [bug on DeepSpeed](https://github.com/microsoft/DeepSpeed/issues/663), we install it separately through ortmodule/stage2/requirements.txt - ${PYTHON_EXE} -m pip install -r ${0/%install_deps.sh/training\/ortmodule\/stage2\/requirements.txt} + if [[ $TARGET_ROCM = false ]]; then + ${PYTHON_EXE} -m pip install -r ${0/%install_deps.sh/training\/ortmodule\/stage1\/requirements.txt} + # Due to a [bug on DeepSpeed](https://github.com/microsoft/DeepSpeed/issues/663), we install it separately through ortmodule/stage2/requirements.txt + ${PYTHON_EXE} -m pip install -r ${0/%install_deps.sh/training\/ortmodule\/stage2\/requirements.txt} + else + ${PYTHON_EXE} -m pip install \ + --pre -f https://download.pytorch.org/whl/nightly/rocm4.1/torch_nightly.html \ + torch torchvision torchtext + ${PYTHON_EXE} -m pip install -r 
${0/%install_deps.sh/training\/ortmodule\/stage1\/requirements-rocm.txt} + ${PYTHON_EXE} -m pip install fairscale + # remove triton requirement from getting triggered in requirements-sparse_attn.txt + git clone https://github.com/ROCmSoftwarePlatform/DeepSpeed + cd DeepSpeed &&\ + rm requirements/requirements-sparse_attn.txt &&\ + ${PYTHON_EXE} setup.py bdist_wheel &&\ + ${PYTHON_EXE} -m pip install dist/deepspeed*.whl &&\ + cd .. + fi fi fi if [[ $INSTALL_DEPS_DISTRIBUTED_SETUP = true ]]; then diff --git a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements-rocm.txt b/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements-rocm.txt new file mode 100644 index 0000000000..06b4399109 --- /dev/null +++ b/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements-rocm.txt @@ -0,0 +1,8 @@ +# transformers requires sklearn +pandas +sklearn +numpy==1.19.5 +transformers==v4.3.2 +tensorboard +h5py +wget