Upgrade the ROCm / MIGraphX CI from ROCm 5.5 to ROCm 5.6.

Summary of what this patch does:
  * Renames the MI100 / MI200 bert-large baseline result files from
    rocm5.5 to rocm5.6 (content unchanged).
  * Sets RocmVersion to 5.6 in both pipelines and forwards it to the image
    build as --build-arg ROCM_VERSION=$(RocmVersion).
  * Rebuilds both CI images from ubuntu:22.04 with ROCm installed from
    repo.radeon.com, and a Miniconda-managed Python 3.8 environment.

Review corrections applied to added lines (hunk line counts are unchanged):
  * rocm image: "pip install install torch==..." -> "pip install torch==...";
    the duplicated word was being resolved as a package name.
  * both images: dropped --no-check-certificate from the Miniconda download;
    ca-certificates is installed in an earlier layer and the installer is
    executed as root, so TLS verification must stay on.
  * migraphx image: the tar step now uses ${CMAKE_VERSION} like the wget step
    above it instead of a second hard-coded 3.26.3.
  * migraphx image: "apt" -> "apt-get" (the apt CLI is not meant for scripts).

NOTE(review): leading whitespace and the position of blank context lines were
reconstructed from the hunk headers; verify against the target tree before
applying.

diff --git a/orttraining/tools/ci_test/results/ci-mi100.huggingface.bert-large-rocm5.5.json b/orttraining/tools/ci_test/results/ci-mi100.huggingface.bert-large-rocm5.6.json
similarity index 100%
rename from orttraining/tools/ci_test/results/ci-mi100.huggingface.bert-large-rocm5.5.json
rename to orttraining/tools/ci_test/results/ci-mi100.huggingface.bert-large-rocm5.6.json
diff --git a/orttraining/tools/ci_test/results/ci-mi200.huggingface.bert-large-rocm5.5.json b/orttraining/tools/ci_test/results/ci-mi200.huggingface.bert-large-rocm5.6.json
similarity index 100%
rename from orttraining/tools/ci_test/results/ci-mi200.huggingface.bert-large-rocm5.5.json
rename to orttraining/tools/ci_test/results/ci-mi200.huggingface.bert-large-rocm5.6.json
diff --git a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml
index f934eba5cb..fb355299d7 100644
--- a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml
@@ -9,7 +9,7 @@ variables:
   - name: render
     value: 109
   - name: RocmVersion
-    value: 5.5
+    value: 5.6
 
 jobs:
 - job: Linux_Build
@@ -31,10 +31,12 @@ jobs:
     clean: true
     submodules: recursive
 
+
   - template: templates/get-docker-image-steps.yml
     parameters:
       Dockerfile: tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile
       Context: tools/ci_build/github/linux/docker
+      DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)"
       Repository: onnxruntimetrainingmigraphx-cibuild-rocm$(RocmVersion)
 
   - task: Cache@2
@@ -131,6 +133,7 @@ jobs:
     parameters:
       Dockerfile: tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile
       Context: tools/ci_build/github/linux/docker
+      DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)"
       Repository: onnxruntimetrainingmigraphx-cibuild-rocm$(RocmVersion)
 
   - task: CmdLine@2
diff --git a/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml
index 1295f84142..0602828c11 100644
--- a/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml
@@ -8,7 +8,7 @@ variables:
   - name: render
     value: 109
   - name: RocmVersion
-    value: 5.5
+    value: 5.6
   - name: BuildConfig
     value: Release
 
@@ -36,6 +36,7 @@ jobs:
     parameters:
       Dockerfile: tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile
       Context: tools/ci_build/github/linux/docker
+      DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)"
       Repository: onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion)-build
 
   #- script: |-
@@ -130,7 +131,7 @@ jobs:
     parameters:
       Dockerfile: tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile
       Context: tools/ci_build/github/linux/docker
-      DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )"
+      DockerBuildArgs: "--build-arg BUILD_UID=$( id -u ) --build-arg ROCM_VERSION=$(RocmVersion)"
       Repository: onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion)-test
 
   - task: Bash@3
diff --git a/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile b/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile
index d3bca26875..d1b1df39b4 100644
--- a/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile
+++ b/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile
@@ -1,13 +1,42 @@
-FROM rocm/pytorch:rocm5.5_ubuntu20.04_py3.8_pytorch_1.13.1
+# Refer to https://github.com/RadeonOpenCompute/ROCm-docker/blob/master/dev/Dockerfile-ubuntu-22.04-complete
+FROM ubuntu:22.04
 
-# MIGraphX version should be the same as ROCm version
-ARG MIGRAPHX_VERSION=rocm-5.5.0
+ARG ROCM_VERSION=5.6
+ARG AMDGPU_VERSION=${ROCM_VERSION}
+ARG APT_PREF='Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600'
+
+CMD ["/bin/bash"]
+
+RUN echo "$APT_PREF" > /etc/apt/preferences.d/rocm-pin-600
 
 ENV DEBIAN_FRONTEND noninteractive
-ENV MIGRAPHX_DISABLE_FAST_GELU=1
 
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends ca-certificates curl libnuma-dev gnupg && \
+    curl -sL https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - &&\
+    printf "deb [arch=amd64] https://repo.radeon.com/rocm/apt/$ROCM_VERSION/ jammy main" | tee /etc/apt/sources.list.d/rocm.list && \
+    printf "deb [arch=amd64] https://repo.radeon.com/amdgpu/$AMDGPU_VERSION/ubuntu jammy main" | tee /etc/apt/sources.list.d/amdgpu.list && \
+    apt-get update && apt-get install -y --no-install-recommends \
+    sudo \
+    libelf1 \
+    kmod \
+    file \
+    python3 \
+    python3-pip \
+    rocm-dev \
+    rocm-libs \
+    build-essential && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN groupadd -g 109 render
+
+# Upgrade to meet security requirements
 RUN apt-get update -y && apt-get upgrade -y && apt-get autoremove -y && \
-    apt-get install -y locales unzip && apt-get clean -y
+    apt-get install -y locales cifs-utils wget half libnuma-dev lsb-release && \
+    apt-get clean -y
+
+ENV MIGRAPHX_DISABLE_FAST_GELU=1
 RUN locale-gen en_US.UTF-8
 RUN update-locale LANG=en_US.UTF-8
 ENV LC_ALL C.UTF-8
@@ -15,28 +44,11 @@ ENV LANG C.UTF-8
 
 WORKDIR /stage
 
-ADD scripts /tmp/scripts
-RUN /tmp/scripts/install_os_deps.sh
-
-# from rocm/pytorch's image, work around ucx's dlopen replacement conflicting with shared provider
-RUN cd /opt/mpi_install/ucx/build &&\
-    make clean &&\
-    ../contrib/configure-release --prefix=/opt/ucx --without-rocm &&\
-    make -j $(nproc) &&\
-    make install
-
-RUN apt-get update &&\
-    apt-get install -y half libnuma-dev
-
-# Install rbuild
-RUN pip3 install https://github.com/RadeonOpenCompute/rbuild/archive/master.tar.gz numpy yapf==0.28.0
-
-# Install MIGraphX from source
-RUN mkdir -p /migraphx
-RUN cd /migraphx && git clone --depth=1 --branch ${MIGRAPHX_VERSION} https://github.com/ROCmSoftwarePlatform/AMDMIGraphX src
-RUN cd /migraphx && rbuild package --cxx /opt/rocm/llvm/bin/clang++ -d /migraphx/deps -B /migraphx/build -S /migraphx/src/ -DPYTHON_EXECUTABLE=/usr/bin/python3
-RUN dpkg -i /migraphx/build/*.deb
-RUN rm -rf /migraphx
+# Cmake
+ENV CMAKE_VERSION=3.26.3
+RUN cd /usr/local && \
+    wget -q https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz && \
+    tar -zxf /usr/local/cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz --strip=1 -C /usr
 
 # ccache
 RUN mkdir -p /tmp/ccache && \
@@ -44,3 +56,28 @@ RUN mkdir -p /tmp/ccache && \
     wget -q -O - https://github.com/ccache/ccache/releases/download/v4.7.4/ccache-4.7.4-linux-x86_64.tar.xz | tar --strip 1 -J -xf - && \
     cp /tmp/ccache/ccache /usr/bin && \
     rm -rf /tmp/ccache
+
+# Install Conda
+ENV PATH /opt/miniconda/bin:${PATH}
+RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && /bin/bash ~/miniconda.sh -b -p /opt/miniconda && \
+    conda init bash && \
+    conda config --set auto_activate_base false && \
+    conda update --all && \
+    rm ~/miniconda.sh && conda clean -ya
+
+# Conda base patch
+RUN pip install cryptography==41.0.0
+
+# Create migraphx-ci environment
+ENV CONDA_ENVIRONMENT_PATH /opt/miniconda/envs/migraphx-ci
+ENV CONDA_DEFAULT_ENV migraphx-ci
+RUN conda create -y -n ${CONDA_DEFAULT_ENV} python=3.8
+ENV PATH ${CONDA_ENVIRONMENT_PATH}/bin:${PATH}
+
+# Enable migraphx-ci environment
+SHELL ["conda", "run", "-n", "migraphx-ci", "/bin/bash", "-c"]
+
+# Install migraphx
+RUN apt-get update && apt-get install -y migraphx
+
+RUN pip install numpy packaging
diff --git a/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile b/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile
index 67dd4d08c0..5343960c8d 100644
--- a/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile
+++ b/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile
@@ -1,8 +1,38 @@
-FROM rocm/cupy:rocm5.5.0_ubuntu20.04_py3.8_pytorch2.0.0_cupy13.0.0
+# Refer to https://github.com/RadeonOpenCompute/ROCm-docker/blob/master/dev/Dockerfile-ubuntu-22.04-complete
+FROM ubuntu:22.04
 
+ARG ROCM_VERSION=5.6
+ARG AMDGPU_VERSION=${ROCM_VERSION}
+ARG APT_PREF='Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600'
+
+CMD ["/bin/bash"]
+
+RUN echo "$APT_PREF" > /etc/apt/preferences.d/rocm-pin-600
+
+ENV DEBIAN_FRONTEND noninteractive
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends ca-certificates curl libnuma-dev gnupg && \
+    curl -sL https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - &&\
+    printf "deb [arch=amd64] https://repo.radeon.com/rocm/apt/$ROCM_VERSION/ jammy main" | tee /etc/apt/sources.list.d/rocm.list && \
+    printf "deb [arch=amd64] https://repo.radeon.com/amdgpu/$AMDGPU_VERSION/ubuntu jammy main" | tee /etc/apt/sources.list.d/amdgpu.list && \
+    apt-get update && apt-get install -y --no-install-recommends \
+    sudo \
+    libelf1 \
+    kmod \
+    file \
+    python3 \
+    python3-pip \
+    rocm-dev \
+    rocm-libs \
+    build-essential && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN groupadd -g 109 render
 
 RUN apt-get update -y && apt-get upgrade -y && apt-get autoremove -y libprotobuf\* protobuf-compiler\* && \
-    rm -f /usr/local/bin/protoc && apt-get install -y locales unzip && apt-get clean -y
+    rm -f /usr/local/bin/protoc && apt-get install -y locales unzip wget git && apt-get clean -y
 RUN locale-gen en_US.UTF-8
 RUN update-locale LANG=en_US.UTF-8
 ENV LC_ALL C.UTF-8
@@ -10,13 +40,6 @@ ENV LANG C.UTF-8
 
 WORKDIR /stage
 
-# from rocm/pytorch's image, work around ucx's dlopen replacement conflicting with shared provider
-RUN cd /opt/mpi_install/ucx/build &&\
-    make clean &&\
-    ../contrib/configure-release --prefix=/opt/ucx --without-rocm &&\
-    make -j $(nproc) &&\
-    make install
-
 # CMake
 ENV CMAKE_VERSION=3.26.3
 RUN cd /usr/local && \
@@ -30,35 +53,83 @@ RUN mkdir -p /tmp/ccache && \
     cp /tmp/ccache/ccache /usr/bin && \
     rm -rf /tmp/ccache
 
-RUN apt-get update && apt-get install -y cifs-utils
+# Install Conda
+ENV PATH /opt/miniconda/bin:${PATH}
+RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && /bin/bash ~/miniconda.sh -b -p /opt/miniconda && \
+    conda init bash && \
+    conda config --set auto_activate_base false && \
+    conda update --all && \
+    rm ~/miniconda.sh && conda clean -ya
 
+# Create rocm-ci environment
+ENV CONDA_ENVIRONMENT_PATH /opt/miniconda/envs/rocm-ci
+ENV CONDA_DEFAULT_ENV rocm-ci
+RUN conda create -y -n ${CONDA_DEFAULT_ENV} python=3.8
+ENV PATH ${CONDA_ENVIRONMENT_PATH}/bin:${PATH}
+
+# Conda base patch
+RUN pip install cryptography==41.0.0
+
+# Enable rocm-ci environment
+SHELL ["conda", "run", "-n", "rocm-ci", "/bin/bash", "-c"]
+
+# ln -sf is needed to make sure that version `GLIBCXX_3.4.30' is found
+RUN ln -sf /usr/lib/x86_64-linux-gnu/libstdc++.so.6 ${CONDA_ENVIRONMENT_PATH}/bin/../lib/libstdc++.so.6
+
+# Install Pytorch
+RUN pip install torch==2.0.1 torchvision==0.15.2 -f https://repo.radeon.com/rocm/manylinux/rocm-rel-${ROCM_VERSION}/ && \
+    pip install torch-ort --no-dependencies
+
+
+##### Install Cupy to decrease CPU utilization
+# Install non dev openmpi
+RUN wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.5.tar.bz2 && \
+    tar -jxf openmpi-4.1.5.tar.bz2 && \
+    cd openmpi-4.1.5 && \
+    ./configure --prefix=/opt/ompi && \
+    make -j4 all && \
+    make install && \
+    cd ../ && \
+    rm -r openmpi-4.1.5 && \
+    rm openmpi-4.1.5.tar.bz2
+
+# Install CuPy, No stable version is available
+RUN git clone https://github.com/ROCmSoftwarePlatform/cupy && cd cupy && \
+    git checkout fc251a808037f8a2270860c2a23a683bfc0de43e && \
+    export CUPY_INSTALL_USE_HIP=1 && \
+    export ROCM_HOME=/opt/rocm && \
+    export HCC_AMDGPU_TARGET=gfx906,gfx908,gfx90a && \
+    git submodule update --init && \
+    pip install -e . --no-cache-dir -vvvv
+
+##### Install transformers to run tests
 # rocm-ci branch contains instrumentation needed for loss curves and perf
 RUN git clone https://github.com/microsoft/huggingface-transformers.git &&\
-    cd huggingface-transformers &&\
-    git checkout rocm-ci &&\
-    pip install -e .
+      cd huggingface-transformers &&\
+      git checkout rocm-ci &&\
+      pip install -e .
 
 RUN pip install \
-    numpy==1.24.1 \
-    onnx \
-    cerberus \
-    sympy \
-    h5py \
-    datasets==1.9.0 \
-    requests \
-    sacrebleu==1.5.1 \
-    sacremoses \
-    scipy==1.10.0 \
-    scikit-learn \
-    tokenizers \
-    sentencepiece \
-    dill==0.3.4 \
-    wget \
-    pytorch_lightning==1.6.0 \
-    pytest-xdist \
-    pytest-rerunfailures
+      flatbuffers==2.0 \
+      numpy==1.24.1 \
+      onnx \
+      cerberus \
+      sympy \
+      h5py \
+      datasets==1.9.0 \
+      requests \
+      sacrebleu==1.5.1 \
+      sacremoses \
+      scipy==1.10.0 \
+      scikit-learn \
+      tokenizers \
+      sentencepiece \
+      wget \
+      dill==0.3.4 \
+      pytorch_lightning==1.6.0 \
+      pytest-xdist \
+      pytest-rerunfailures
 
-RUN pip install torch-ort --no-dependencies
 ENV ORTMODULE_ONNX_OPSET_VERSION=15
 
 ARG BUILD_UID=1001