Add CI pipeline to publish Python training package targeting Rocm (#7417)

* first attempt rocm training wheel * modifications needed to python packaging pipeline for Rocm 4.1 * changes to not conflict with cuda missed stage1 changes remove package push add option r to getopt try again without python install try again without python install try again without python install split pipelines and add back push to remote storage try on cuda gpu pool try again try again try running without az subscription set try again on original pipeline change pool passing AMD Rocm whl on AMD-GPU pool split rocm pipeline from cuda pipeline remove comments * try adding Rocm tests as well * try with tests in place * fix trailing ws * add training data * try again as root for tests * use python3 * typo * try to map video, render group into container * try again * try again * try to avoid yum error code * make UID 1001 * try without yum downgrade * define rocm_version=None * remove CUDA related comments for Rocm Dockerfile * Don't pin nightly torch torchvision torchtext versions as they expire (for now nightly is required for Rocm 4.1) * missed requirements-rocm.txt from last commit * fix whitespace
2026-07-12 17:57:38 +00:00 · 2021-04-23 17:22:31 -07:00 · 2021-04-23 17:22:31 -07:00 · 7a3c1787af
commit 7a3c1787af
parent 34ebf7d3dd
9 changed files with 359 additions and 7 deletions
--- a/setup.py
+++ b/setup.py
@ -47,12 +47,16 @@ if parse_arg_remove_boolean(sys.argv, '--nightly_build'):
 wheel_name_suffix = parse_arg_remove_string(sys.argv, '--wheel_name_suffix=')

 cuda_version = None
+rocm_version = None
 # The following arguments are mutually exclusive
 if parse_arg_remove_boolean(sys.argv, '--use_tensorrt'):
    package_name = 'onnxruntime-gpu-tensorrt' if not nightly_build else 'ort-trt-nightly'
 elif parse_arg_remove_boolean(sys.argv, '--use_cuda'):
    package_name = 'onnxruntime-gpu' if not nightly_build else 'ort-gpu-nightly'
    cuda_version = parse_arg_remove_string(sys.argv, '--cuda_version=')
+elif parse_arg_remove_boolean(sys.argv, '--use_rocm'):
+    package_name = 'onnxruntime-rocm' if not nightly_build else 'ort-rocm-nightly'
+    rocm_version = parse_arg_remove_string(sys.argv, '--rocm_version=')
 elif parse_arg_remove_boolean(sys.argv, '--use_openvino'):
    package_name = 'onnxruntime-openvino'
 elif parse_arg_remove_boolean(sys.argv, '--use_dnnl'):
@ -131,6 +135,7 @@ try:
                copyfile(source, dest)
                result = subprocess.run(['patchelf', '--print-needed', dest], check=True, stdout=subprocess.PIPE, universal_newlines=True)
                cuda_dependencies = ['libcublas.so', 'libcudnn.so', 'libcudart.so', 'libcurand.so', 'libcufft.so', 'libnvToolsExt.so']
+                cuda_dependencies.extend(['librccl.so', 'libamdhip64.so', 'librocblas.so', 'libMIOpen.so', 'libhsa-runtime64.so', 'libhsakmt.so'])
                to_preload = []
                args = ['patchelf', '--debug']
                for line in result.stdout.split('\n'):
@ -255,6 +260,11 @@ if enable_training:
        # removing '.' to make Cuda version number in the same form as Pytorch.
        cuda_version = cuda_version.replace('.', '')
        local_version = '+cu' + cuda_version
+    if rocm_version:
+        # removing '.' to make the ROCm version number in the same form as Pytorch.
+        rocm_version = rocm_version.replace('.', '')
+        local_version = '+rocm' + rocm_version
+

 package_data = {}
 data_files = []
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@ -499,6 +499,8 @@ def parse_arguments():
    parser.add_argument("--disable_ort_format_load", action='store_true',
                        help='Disable support for loading ORT format models in a non-minimal build.')

+    parser.add_argument(
+        "--rocm_version", help="The version of ROCM stack to use. ")
    parser.add_argument("--use_rocm", action='store_true', help="Build with ROCm")
    parser.add_argument("--rocm_home", help="Path to ROCm installation dir")

@ -642,6 +644,7 @@ def generate_build_tree(cmake_path, source_dir, build_dir, cuda_home, cudnn_home
        "-DPYTHON_EXECUTABLE=" + sys.executable,
        "-Donnxruntime_USE_CUDA=" + ("ON" if args.use_cuda else "OFF"),
        "-Donnxruntime_CUDA_VERSION=" + (args.cuda_version if args.use_cuda else ""),
+        "-Donnxruntime_ROCM_VERSION=" + (args.rocm_version if args.use_rocm else ""),
        "-Donnxruntime_CUDA_HOME=" + (cuda_home if args.use_cuda else ""),
        "-Donnxruntime_CUDNN_HOME=" + (cudnn_home if args.use_cuda else ""),
        "-Donnxruntime_USE_FEATURIZERS=" + ("ON" if args.use_featurizers else "OFF"),
@ -1489,7 +1492,7 @@ def run_nodejs_tests(nodejs_binding_dir):


 def build_python_wheel(
-        source_dir, build_dir, configs, use_cuda, cuda_version, use_dnnl,
+        source_dir, build_dir, configs, use_cuda, cuda_version, use_rocm, rocm_version, use_dnnl,
        use_tensorrt, use_openvino, use_nuphar, use_vitisai, use_acl, use_armnn, use_dml,
        wheel_name_suffix, enable_training, nightly_build=False, featurizers_build=False, use_ninja=False):
    for config in configs:
@ -1527,6 +1530,10 @@ def build_python_wheel(
            args.append('--use_cuda')
            if cuda_version:
                args.append('--cuda_version={}'.format(cuda_version))
+        elif use_rocm:
+            args.append('--use_rocm')
+            if rocm_version:
+                args.append('--rocm_version={}'.format(rocm_version))
        elif use_openvino:
            args.append('--use_openvino')
        elif use_dnnl:
@ -1999,6 +2006,8 @@ def main():
                raise BuildError("cuda_version must be specified on Windows.")
            else:
                args.cuda_version = ""
+        if args.use_rocm and args.rocm_version is None:
+            args.rocm_version = ""
        generate_build_tree(
            cmake_path, source_dir, build_dir, cuda_home, cudnn_home, rocm_home, mpi_home, nccl_home,
            tensorrt_home, migraphx_home, acl_home, acl_libs, armnn_home, armnn_libs,
@ -2037,6 +2046,8 @@ def main():
                configs,
                args.use_cuda,
                args.cuda_version,
+                args.use_rocm,
+                args.rocm_version,
                args.use_dnnl,
                args.use_tensorrt,
                args.use_openvino,
--- a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-rocm.yml
+++ b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-rocm.yml
@ -0,0 +1,14 @@
+trigger: none
+
+stages:
+- template: templates/py-packaging-stage.yml
+  parameters:
+    build_py_parameters: --enable_training
+    enable_linux_cpu: false
+    enable_linux_gpu: false
+    enable_linux_gpu_training: false
+    enable_linux_rocm_training: true
+    enable_windows_cpu: false
+    enable_windows_gpu: false
+    enable_mac_cpu: false
+    enable_linux_arm: false
--- a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline.yml
@ -7,6 +7,7 @@ stages:
    enable_linux_cpu: false
    enable_linux_gpu: false
    enable_linux_gpu_training: true
+    enable_linux_rocm_training: false
    enable_windows_cpu: false
    enable_windows_gpu: false
    enable_mac_cpu: false
--- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml
@ -20,6 +20,11 @@ parameters:
  type: boolean
  default: false

+- name: enable_linux_rocm_training
+  displayName: 'Whether Linux ROCM package is built.'
+  type: boolean
+  default: false
+
 - name: enable_windows_cpu
  displayName: 'Whether Windows CPU package is built.'
  type: boolean
@ -217,6 +222,256 @@ stages:

      - template: clean-agent-build-directory-step.yml

+  - ${{ if eq(parameters.enable_linux_rocm_training, true) }}:
+    - job: Linux_py_ROCM_Wheels
+      timeoutInMinutes: 180
+      workspace:
+        clean: all
+      pool: AMD-GPU
+      # pool: Onnxruntime-Linux-GPU
+      strategy:
+        matrix:
+          Python36:
+            PythonVersion: '3.6'
+          Python37:
+            PythonVersion: '3.7'
+          Python38:
+            PythonVersion: '3.8'
+          # dependency PyTorch does not support Python 3.9 yet
+          # Python39:
+          #   PythonVersion: '3.9'
+      steps:
+
+      - checkout: self
+        clean: true
+        submodules: recursive
+
+      - template: set-python-manylinux-variables-step.yml
+
+      - template: get-docker-image-steps.yml
+        parameters:
+          Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_rocm
+          Context: tools/ci_build/github/linux/docker
+          DockerBuildArgs: >-
+            --build-arg PYTHON_VERSION=$(PythonVersion)
+            --build-arg INSTALL_DEPS_EXTRA_ARGS=-tmur
+            --build-arg BUILD_UID=$(id -u)
+          Repository: onnxruntimetrainingrocmbuild
+
+      - task: CmdLine@2
+        inputs:
+          script: |
+            docker run --rm \
+              --privileged \
+              --ipc=host \
+              --network=host \
+              --cap-add=SYS_PTRACE \
+              --security-opt seccomp=unconfined \
+              -e CC=/opt/rh/devtoolset-8/root/usr/bin/cc -e CXX=/opt/rh/devtoolset-8/root/usr/bin/c++ -e CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" \
+              --volume $(Build.SourcesDirectory):/onnxruntime_src \
+              --volume $(Build.BinariesDirectory):/build \
+              --workdir /onnxruntime_src \
+              --entrypoint $(PythonManylinuxDir)/bin/python3 \
+              -e NVIDIA_VISIBLE_DEVICES=all \
+              -e NIGHTLY_BUILD \
+              -e BUILD_BUILDNUMBER \
+              --user onnxruntimedev \
+              onnxruntimetrainingrocmbuild \
+                /onnxruntime_src/tools/ci_build/build.py \
+                  --config Release \
+                  --use_rocm \
+                    --rocm_version=4.1 \
+                    --rocm_home=/opt/rocm \
+                    --nccl_home=/opt/rocm \
+                  --update \
+                  --parallel \
+                  --build_dir /build \
+                  --build \
+                  --build_wheel \
+                  --skip_tests \
+                  ${{ parameters.build_py_parameters }} \
+                  --cmake_extra_defines  PYTHON_INCLUDE_DIR=$(PythonManylinuxIncludeDir) PYTHON_LIBRARY=/usr/lib64/librt.so \
+          workingDirectory: $(Build.SourcesDirectory)
+        displayName: 'Build onnxruntime (in container)'
+
+      - script: |-
+          python3 orttraining/tools/ci_test/download_azure_blob_archive.py \
+            --azure_blob_url https://onnxruntimetestdata.blob.core.windows.net/training/onnxruntime_training_data.zip?snapshot=2020-06-15T23:17:35.8314853Z \
+            --target_dir $(Build.SourcesDirectory)/training_e2e_test_data \
+            --archive_sha256_digest B01C169B6550D1A0A6F1B4E2F34AE2A8714B52DBB70AC04DA85D371F691BDFF9
+        displayName: 'Download onnxruntime_training_data.zip data'
+
+      - script: |-
+          echo "Tests will run using HIP_VISIBLES_DEVICES=$HIP_VISIBLE_DEVICES"
+          video_gid=$(getent group | awk '/video/ {split($0,a,":"); print(a[3])}')
+          echo "##vso[task.setvariable variable=video]$video_gid"
+          render_gid=$(getent group | awk '/render/ {split($0,a,":"); print(a[3])}')
+          echo "##vso[task.setvariable variable=render]$render_gid"
+        displayName: 'Find video and render gid to be mapped into container'
+ 
+      - script: |-
+          echo "video=$video"
+          echo "render=$render"
+          docker run --rm \
+            --device=/dev/kfd \
+            --device=/dev/dri \
+            --group-add $(video) \
+            --group-add $(render) \
+            --privileged \
+            --ipc=host \
+            --network=host \
+            --cap-add=SYS_PTRACE \
+            --security-opt seccomp=unconfined \
+            --volume $(Build.SourcesDirectory):/onnxruntime_src \
+            --volume $(Build.BinariesDirectory):/build \
+            --workdir /build/Release \
+            --entrypoint /bin/bash \
+            -e HIP_VISIBLE_DEVICES \
+            -e NIGHTLY_BUILD \
+            -e BUILD_BUILDNUMBER \
+            --user onnxruntimedev \
+            onnxruntimetrainingrocmbuild \
+               /onnxruntime_src/tools/ci_build/github/pai/pai_test_launcher.sh
+        displayName: 'Run onnxruntime unit tests (in container)'
+      
+      - script: |-
+          docker run --rm \
+            --device=/dev/kfd \
+            --device=/dev/dri \
+            --group-add $(video) \
+            --group-add $(render) \
+            --privileged \
+            --ipc=host \
+            --network=host \
+            --cap-add=SYS_PTRACE \
+            --security-opt seccomp=unconfined \
+            --volume $(Build.SourcesDirectory):/onnxruntime_src \
+            --volume $(Build.BinariesDirectory):/build \
+            --workdir /onnxruntime_src \
+            --entrypoint $(PythonManylinuxDir)/bin/python3 \
+            -e HIP_VISIBLE_DEVICES \
+            -e NIGHTLY_BUILD \
+            -e BUILD_BUILDNUMBER \
+            --user onnxruntimedev \
+            onnxruntimetrainingrocmbuild \
+              orttraining/tools/ci_test/run_batch_size_test.py \
+                --binary_dir /build/Release \
+                --model_root training_e2e_test_data/models \
+                --gpu_sku MI100_32G
+        displayName: 'Run C++ BERT-L batch size test (in container)'
+        condition: succeededOrFailed() # ensure all tests are run
+      
+      - script: |-
+          docker run --rm \
+            --device=/dev/kfd \
+            --device=/dev/dri \
+            --group-add $(video) \
+            --group-add $(render) \
+            --privileged \
+            --ipc=host \
+            --network=host \
+            --cap-add=SYS_PTRACE \
+            --security-opt seccomp=unconfined \
+            --volume $(Build.SourcesDirectory):/onnxruntime_src \
+            --volume $(Build.BinariesDirectory):/build \
+            --workdir /onnxruntime_src \
+            --entrypoint $(PythonManylinuxDir)/bin/python3 \
+            -e HIP_VISIBLE_DEVICES \
+            -e NIGHTLY_BUILD \
+            -e BUILD_BUILDNUMBER \
+            --user onnxruntimedev \
+            onnxruntimetrainingrocmbuild \
+              orttraining/tools/ci_test/run_bert_perf_test.py \
+                --binary_dir /build/Release \
+                --model_root training_e2e_test_data/models \
+                --training_data_root training_e2e_test_data/data \
+                --gpu_sku MI100_32G
+        displayName: 'Run C++ BERT-L performance test (in container)'
+        condition: succeededOrFailed() # ensure all tests are run
+      
+      - script: |-
+          docker run --rm \
+            --device=/dev/kfd \
+            --device=/dev/dri \
+            --group-add $(video) \
+            --group-add $(render) \
+            --privileged \
+            --ipc=host \
+            --network=host \
+            --cap-add=SYS_PTRACE \
+            --security-opt seccomp=unconfined \
+            --volume $(Build.SourcesDirectory):/onnxruntime_src \
+            --volume $(Build.BinariesDirectory):/build \
+            --workdir /onnxruntime_src \
+            --entrypoint $(PythonManylinuxDir)/bin/python3 \
+            -e HIP_VISIBLE_DEVICES \
+            -e NIGHTLY_BUILD \
+            -e BUILD_BUILDNUMBER \
+            --user onnxruntimedev \
+            onnxruntimetrainingrocmbuild \
+              orttraining/tools/ci_test/run_convergence_test.py \
+                --binary_dir /build/Release \
+                --model_root training_e2e_test_data/models \
+                --training_data_root training_e2e_test_data/data \
+                --gpu_sku MI100_32G
+        displayName: 'Run C++ BERT-L convergence test (in container)'
+        condition: succeededOrFailed() # ensure all tests are run
+      
+      - task: CopyFiles@2
+        displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)'
+        inputs:
+          SourceFolder: '$(Build.BinariesDirectory)'
+          Contents: 'Release/dist/*.whl'
+          TargetFolder: '$(Build.ArtifactStagingDirectory)'
+
+      - task: CmdLine@2
+        displayName: 'Build Python Documentation'
+        condition: ne(variables['PythonVersion'], '3.9')  # tensorflow not available on python 3.9
+        inputs:
+          script: |
+            mkdir -p $HOME/.onnx
+            docker run --rm \
+              --volume $(Build.SourcesDirectory):/onnxruntime_src \
+              --volume $(Build.BinariesDirectory):/build \
+              -e NIGHTLY_BUILD \
+              -e BUILD_BUILDNUMBER \
+              --entrypoint /bin/bash \
+              onnxruntimetrainingrocmbuild \
+                /onnxruntime_src/tools/doc/builddoc.sh $(PythonManylinuxDir)/bin/ /onnxruntime_src /build Release
+          workingDirectory: $(Build.SourcesDirectory)
+
+      - task: CopyFiles@2
+        displayName: 'Copy Python Documentation to: $(Build.ArtifactStagingDirectory)'
+        condition: ne(variables['PythonVersion'], '3.9')  # tensorflow not available on python 3.9
+        inputs:
+          SourceFolder: '$(Build.BinariesDirectory)/docs/training/html'
+          Contents: '**'
+          TargetFolder: '$(Build.ArtifactStagingDirectory)/training_html_doc'
+
+      - task: PublishBuildArtifacts@1
+        displayName: 'Upload Rocm wheel as build artifact'
+        inputs:
+          ArtifactName: onnxruntime_rocm
+
+      - script: |
+          python3 -m pip install azure-storage-blob==2.1.0
+          files=($(Build.ArtifactStagingDirectory)/Release/dist/*.whl) && \
+          echo ${files[0]} && \
+          python3 tools/ci_build/upload_python_package_to_azure_storage.py \
+              --python_wheel_path ${files[0]} \
+              --account_name onnxruntimepackages \
+              --account_key $(orttrainingpackagestorageaccountkey) \
+              --container_name '$web'
+        condition: and(succeeded(), eq(variables['DRY_RUN'], '0'))
+        displayName: 'Upload Rocm wheel to release repository'
+
+      - template: component-governance-component-detection-steps.yml
+        parameters:
+          condition: 'succeeded'
+
+      - template: clean-agent-build-directory-step.yml
+
+
  - ${{ if eq(parameters.enable_linux_gpu_training, true) }}:
    - job: Linux_py_GPU_Wheels
      timeoutInMinutes: 180
--- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_rocm
+++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_rocm
@ -0,0 +1,36 @@
+FROM rocm/pytorch:rocm4.1.1_centos7_py3.6_pytorch
+
+#Build manylinux2014 docker image begin
+ENV AUDITWHEEL_ARCH x86_64
+ENV AUDITWHEEL_PLAT manylinux2014_$AUDITWHEEL_ARCH
+ENV LC_ALL en_US.UTF-8
+ENV LANG en_US.UTF-8
+ENV LANGUAGE en_US.UTF-8
+ENV DEVTOOLSET_ROOTPATH /opt/rh/devtoolset-8/root
+ENV PATH $DEVTOOLSET_ROOTPATH/usr/bin:$PATH
+ENV LD_LIBRARY_PATH $DEVTOOLSET_ROOTPATH/usr/lib64:$DEVTOOLSET_ROOTPATH/usr/lib:$DEVTOOLSET_ROOTPATH/usr/lib64/dyninst:$DEVTOOLSET_ROOTPATH/usr/lib/dyninst:/usr/local/lib64:/usr/local/lib
+ENV PKG_CONFIG_PATH /usr/local/lib/pkgconfig
+
+COPY manylinux2014_build_scripts /manylinux2014_build_scripts
+RUN bash /manylinux2014_build_scripts/build.sh 8 && rm -r /manylinux2014_build_scripts 
+
+ENV SSL_CERT_FILE=/opt/_internal/certs.pem
+
+#Build manylinux2014 docker image end
+
+ARG PYTHON_VERSION=3.6
+ARG INSTALL_DEPS_EXTRA_ARGS
+
+#Add our own dependencies
+ADD scripts /tmp/scripts
+RUN cd /tmp/scripts && \
+    /tmp/scripts/install_centos.sh && \
+    /tmp/scripts/install_deps.sh -d gpu -p $PYTHON_VERSION $INSTALL_DEPS_EXTRA_ARGS && \
+    rm -rf /tmp/scripts
+
+ARG BUILD_UID=1001
+ARG BUILD_USER=onnxruntimedev
+RUN adduser --uid $BUILD_UID $BUILD_USER
+WORKDIR /home/$BUILD_USER
+USER $BUILD_USER
+ENV PATH /usr/local/gradle/bin:/usr/local/dotnet:$PATH
--- a/tools/ci_build/github/linux/docker/manylinux2014_build_scripts/build.sh
+++ b/tools/ci_build/github/linux/docker/manylinux2014_build_scripts/build.sh
@ -66,8 +66,8 @@ TOOLCHAIN_DEPS="devtoolset-$1-binutils devtoolset-$1-gcc devtoolset-$1-gcc-c++ d
 if [ "${AUDITWHEEL_ARCH}" == "x86_64" ]; then
    # Software collection (for devtoolset-$1)
    yum -y install centos-release-scl-rh
-    # EPEL support (for yasm)
-    yum -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
+    # EPEL support (for yasm); use localinstall to avoid an error code when EPEL is already installed, as in the Rocm container
+    yum -y localinstall https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
    YASM=yasm
 elif [ "${AUDITWHEEL_ARCH}" == "aarch64" ] || [ "${AUDITWHEEL_ARCH}" == "ppc64le" ] || [ "${AUDITWHEEL_ARCH}" == "s390x" ]; then
    # Software collection (for devtoolset-$1)
--- a/tools/ci_build/github/linux/docker/scripts/install_deps.sh
+++ b/tools/ci_build/github/linux/docker/scripts/install_deps.sh
@ -5,8 +5,9 @@ SCRIPT_DIR="$( dirname "${BASH_SOURCE[0]}" )"
 INSTALL_DEPS_TRAINING=false
 INSTALL_DEPS_DISTRIBUTED_SETUP=false
 ORTMODULE_BUILD=false
+TARGET_ROCM=false

-while getopts p:d:tmu parameter_Option
+while getopts p:d:tmur parameter_Option
 do case "${parameter_Option}"
 in
 p) PYTHON_VER=${OPTARG};;
@ -14,6 +15,7 @@ d) DEVICE_TYPE=${OPTARG};;
 t) INSTALL_DEPS_TRAINING=true;;
 m) INSTALL_DEPS_DISTRIBUTED_SETUP=true;;
 u) ORTMODULE_BUILD=true;;
+r) TARGET_ROCM=true;;
 esac
 done

@ -121,9 +123,24 @@ if [ $DEVICE_TYPE = "gpu" ]; then
    if [[ $ORTMODULE_BUILD = false ]]; then
      ${PYTHON_EXE} -m pip install -r ${0/%install_deps.sh/training\/requirements.txt}
    else
-      ${PYTHON_EXE} -m pip install -r ${0/%install_deps.sh/training\/ortmodule\/stage1\/requirements.txt}
-      # Due to a [bug on DeepSpeed](https://github.com/microsoft/DeepSpeed/issues/663), we install it separately through ortmodule/stage2/requirements.txt
-      ${PYTHON_EXE} -m pip install -r ${0/%install_deps.sh/training\/ortmodule\/stage2\/requirements.txt}
+      if [[ $TARGET_ROCM = false ]]; then
+        ${PYTHON_EXE} -m pip install -r ${0/%install_deps.sh/training\/ortmodule\/stage1\/requirements.txt}
+        # Due to a [bug on DeepSpeed](https://github.com/microsoft/DeepSpeed/issues/663), we install it separately through ortmodule/stage2/requirements.txt
+        ${PYTHON_EXE} -m pip install -r ${0/%install_deps.sh/training\/ortmodule\/stage2\/requirements.txt}
+      else
+        ${PYTHON_EXE} -m pip install \
+          --pre -f https://download.pytorch.org/whl/nightly/rocm4.1/torch_nightly.html \
+          torch torchvision torchtext
+        ${PYTHON_EXE} -m pip install -r ${0/%install_deps.sh/training\/ortmodule\/stage1\/requirements-rocm.txt}
+        ${PYTHON_EXE} -m pip install fairscale
+	# remove triton requirement from getting triggered in requirements-sparse_attn.txt
+        git clone https://github.com/ROCmSoftwarePlatform/DeepSpeed
+        cd DeepSpeed &&\
+          rm requirements/requirements-sparse_attn.txt &&\
+          ${PYTHON_EXE} setup.py bdist_wheel &&\
+          ${PYTHON_EXE} -m pip install dist/deepspeed*.whl &&\
+	  cd ..
+      fi
    fi
  fi
  if [[ $INSTALL_DEPS_DISTRIBUTED_SETUP = true ]]; then
--- a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements-rocm.txt
+++ b/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements-rocm.txt
@ -0,0 +1,8 @@
+# transformers requires sklearn
+pandas
+sklearn
+numpy==1.19.5
+transformers==v4.3.2
+tensorboard
+h5py
+wget