From da42796f1fc37746bf70bab2b44cbbca96fea07e Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Thu, 30 Jan 2025 05:35:30 +0000 Subject: [PATCH] rocm 6.3.2, python 3.12 and ubuntu 24.04 for rocm ci --- .../linux-rocm-ci-pipeline.yml | 25 +++-- .../linux/docker/Dockerfile.ubuntu_rocm | 92 ++++++++++++++++ .../docker/rocm-ci-pipeline-env.Dockerfile | 100 ------------------ 3 files changed, 106 insertions(+), 111 deletions(-) create mode 100644 tools/ci_build/github/linux/docker/Dockerfile.ubuntu_rocm delete mode 100644 tools/ci_build/github/linux/docker/rocm-ci-pipeline-env.Dockerfile diff --git a/tools/ci_build/github/azure-pipelines/linux-rocm-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-rocm-ci-pipeline.yml index 453c5885f3..60cd9cf1d8 100644 --- a/tools/ci_build/github/azure-pipelines/linux-rocm-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-rocm-ci-pipeline.yml @@ -37,7 +37,7 @@ variables: - name: render value: 109 - name: RocmVersion - value: 6.1.3 + value: 6.3.2 jobs: - job: Linux_Build @@ -59,10 +59,10 @@ jobs: - template: templates/get-docker-image-steps.yml parameters: - Dockerfile: tools/ci_build/github/linux/docker/rocm-ci-pipeline-env.Dockerfile + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.ubuntu_rocm Context: tools/ci_build/github/linux/docker DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)" - Repository: onnxruntimerocm-cibuild-rocm$(RocmVersion) + Repository: onnxruntime-rocm-cibuild-rocm$(RocmVersion) - task: Cache@2 inputs: @@ -90,11 +90,12 @@ jobs: --volume $(CCACHE_DIR):/cache \ -e CCACHE_DIR=/cache \ --workdir /onnxruntime_src \ - onnxruntimerocm-cibuild-rocm$(RocmVersion) \ + onnxruntime-rocm-cibuild-rocm$(RocmVersion) \ /bin/bash -c " set -ex; \ - env; \ ccache -s; \ + source /ort/env/bin/activate; \ + env; \ python tools/ci_build/build.py \ --config Release \ --cmake_extra_defines \ @@ -159,10 +160,10 @@ jobs: - template: templates/get-docker-image-steps.yml parameters: - Dockerfile: tools/ci_build/github/linux/docker/rocm-ci-pipeline-env.Dockerfile + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.ubuntu_rocm Context: tools/ci_build/github/linux/docker DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)" - Repository: onnxruntimerocm-cibuild-rocm$(RocmVersion) + Repository: onnxruntime-rocm-cibuild-rocm$(RocmVersion) - task: CmdLine@2 inputs: @@ -179,10 +180,11 @@ jobs: --volume $(Build.BinariesDirectory):/build \ --volume /data/models:/build/models:ro \ --workdir /build/Release \ - onnxruntimerocm-cibuild-rocm$(RocmVersion) \ + onnxruntime-rocm-cibuild-rocm$(RocmVersion) \ /bin/bash -c " set -ex; \ xargs -a /build/Release/perms.txt chmod a+x; \ + source /ort/env/bin/activate; \ python /onnxruntime_src/tools/ci_build/build.py \ --config Release \ --cmake_extra_defines \ @@ -223,13 +225,14 @@ jobs: -e MKL_NUM_THREADS=1 \ -e KERNEL_EXPLORER_BUILD_DIR=/build/Release \ -e KERNEL_EXPLORER_BATCHED_GEMM_MAX_BATCH_SIZE=8 \ - -e KERNEL_EXPLORER_TEST_USE_CUPY=1 \ + -e KERNEL_EXPLORER_TEST_USE_CUPY=0 \ -e CUPY_CACHE_DIR=/build/Release \ - onnxruntimerocm-cibuild-rocm$(RocmVersion) \ + --workdir /ort \ + onnxruntime-rocm-cibuild-rocm$(RocmVersion) \ /bin/bash -c " set -ex; \ + source /ort/env/bin/activate; \ python --version; \ - ls /opt/miniconda/envs/rocm-ci/lib/; \ pytest /onnxruntime_src/onnxruntime/python/tools/kernel_explorer/ -n 4 --reruns 1 --durations=100" workingDirectory: $(Build.SourcesDirectory) displayName: 'Run kernel explorer tests' diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_rocm b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_rocm new file mode 100644 index 0000000000..b1941ceceb --- /dev/null +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_rocm @@ -0,0 +1,92 @@ +# Refer to https://github.com/RadeonOpenCompute/ROCm-docker/blob/master/dev/Dockerfile-ubuntu-24.04-complete +FROM ubuntu:24.04 + +ARG ROCM_VERSION=6.3.2 +ARG PYTHON_VERSION=3.12 +ARG CMAKE_VERSION=3.30.1 +ARG CCACHE_VERSION=4.7.4 +ARG USE_MIGRAPHX=0 + +LABEL ROCM_VERSION="${ROCM_VERSION}" +LABEL USE_MIGRAPHX="${USE_MIGRAPHX}" +LABEL PYTHON_VERSION="${PYTHON_VERSION}" + +# Set environment variables +ENV DEBIAN_FRONTEND=noninteractive \ + LC_ALL=C.UTF-8 \ + LANG=C.UTF-8 \ + LD_LIBRARY_PATH=/opt/rocm/lib:/usr/lib/x86_64-linux-gnu \ + PATH=/opt/rocm/bin:/usr/bin:/bin:/usr/sbin:/usr/local/bin + +# Set default shell +SHELL ["/bin/bash", "-c"] + +# Add ROCm package pinning +RUN echo "Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600" > /etc/apt/preferences.d/rocm-pin-600 + +# Install dependencies and ROCm +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + ca-certificates \ + curl \ + libnuma-dev \ + gnupg && \ + curl -sL https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \ + printf "deb [arch=amd64] https://repo.radeon.com/rocm/apt/${ROCM_VERSION}/ noble main" | tee --append /etc/apt/sources.list.d/rocm.list && \ + printf "deb [arch=amd64] https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu noble main" | tee /etc/apt/sources.list.d/amdgpu.list && \ + migraphx=$( [ "$USE_MIGRAPHX" -eq 1 ] && echo "migraphx" || echo "" ) && \ + apt-get update && apt-get install -y --no-install-recommends \ + sudo \ + libelf1 \ + kmod \ + file \ + git \ + python3-pip \ + python${PYTHON_VERSION}-dev \ + python${PYTHON_VERSION}-venv \ + hipcc=1.1.1.60302-66~24.04 \ + rocm-cmake=0.14.0.60302-66~24.04 \ + rocm-utils=6.3.2.60302-66~24.04 \ + rocminfo=1.0.0.60302-66~24.04 \ + rocm-dev \ + rocm-libs \ + $migraphx \ + build-essential \ + locales \ + cifs-utils \ + wget \ + half \ + libnuma-dev \ + lsb-release && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* && \ + locale-gen en_US.UTF-8 && \ + update-locale LANG=en_US.UTF-8 + +# Create render group +RUN groupadd -g 109 render + +# Install CMake +WORKDIR /usr/local +RUN wget -q https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz && \ + tar -zxf cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz --strip=1 -C /usr && \ + rm -f cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz + +# Install ccache +RUN mkdir -p /tmp/ccache && \ + cd /tmp/ccache && \ + wget -q -O - https://github.com/ccache/ccache/releases/download/v${CCACHE_VERSION}/ccache-${CCACHE_VERSION}-linux-x86_64.tar.xz | tar --strip 1 -J -xf - && \ + cp /tmp/ccache/ccache /usr/bin/ && \ + rm -rf /tmp/ccache + +# Set up virtual environment for Python and install dependencies +WORKDIR /ort +COPY scripts/requirements.txt /ort/ +RUN python3 -m venv /ort/env && \ + source /ort/env/bin/activate && \ + pip install --upgrade pip setuptools wheel && \ + pip install -r /ort/requirements.txt && \ + pip install ml_dtypes pytest-xdist pytest-rerunfailures scipy + +# Default command +CMD ["/bin/bash"] diff --git a/tools/ci_build/github/linux/docker/rocm-ci-pipeline-env.Dockerfile b/tools/ci_build/github/linux/docker/rocm-ci-pipeline-env.Dockerfile deleted file mode 100644 index f74c5c7b02..0000000000 --- a/tools/ci_build/github/linux/docker/rocm-ci-pipeline-env.Dockerfile +++ /dev/null @@ -1,100 +0,0 @@ -# Refer to https://github.com/RadeonOpenCompute/ROCm-docker/blob/master/dev/Dockerfile-ubuntu-22.04-complete -FROM ubuntu:22.04 - -ARG ROCM_VERSION=6.1.3 -ARG AMDGPU_VERSION=${ROCM_VERSION} -ARG APT_PREF='Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' - -CMD ["/bin/bash"] - -RUN echo "$APT_PREF" > /etc/apt/preferences.d/rocm-pin-600 - -ENV DEBIAN_FRONTEND noninteractive - -RUN apt-get update && \ - apt-get install -y --no-install-recommends ca-certificates curl libnuma-dev gnupg && \ - curl -sL https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - &&\ - printf "deb [arch=amd64] https://repo.radeon.com/rocm/apt/$ROCM_VERSION/ jammy main" | tee /etc/apt/sources.list.d/rocm.list && \ - printf "deb [arch=amd64] https://repo.radeon.com/amdgpu/$AMDGPU_VERSION/ubuntu jammy main" | tee /etc/apt/sources.list.d/amdgpu.list && \ - apt-get update && apt-get install -y --no-install-recommends \ - sudo \ - libelf1 \ - kmod \ - file \ - python3 \ - python3-pip \ - rocm-dev \ - rocm-libs \ - build-essential && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -RUN groupadd -g 109 render - -# Upgrade to meet security requirements -RUN apt-get update -y && apt-get upgrade -y && apt-get autoremove -y && \ - apt-get install -y locales cifs-utils wget half libnuma-dev lsb-release && \ - apt-get clean -y - -RUN locale-gen en_US.UTF-8 -RUN update-locale LANG=en_US.UTF-8 -ENV LC_ALL C.UTF-8 -ENV LANG C.UTF-8 - -WORKDIR /stage - -# Cmake -ENV CMAKE_VERSION=3.30.1 -RUN cd /usr/local && \ - wget -q https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz && \ - tar -zxf /usr/local/cmake-3.30.1-Linux-x86_64.tar.gz --strip=1 -C /usr - -# ccache -RUN mkdir -p /tmp/ccache && \ - cd /tmp/ccache && \ - wget -q -O - https://github.com/ccache/ccache/releases/download/v4.7.4/ccache-4.7.4-linux-x86_64.tar.xz | tar --strip 1 -J -xf - && \ - cp /tmp/ccache/ccache /usr/bin && \ - rm -rf /tmp/ccache - -# Install Conda -ENV PATH /opt/miniconda/bin:${PATH} -RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh --no-check-certificate && /bin/bash ~/miniconda.sh -b -p /opt/miniconda && \ - conda init bash && \ - conda config --set auto_activate_base false && \ - conda update --all && \ - rm ~/miniconda.sh && conda clean -ya - -# Create rocm-ci environment -ENV CONDA_ENVIRONMENT_PATH /opt/miniconda/envs/rocm-ci -ENV CONDA_DEFAULT_ENV rocm-ci -RUN conda create -y -n ${CONDA_DEFAULT_ENV} python=3.10 -ENV PATH ${CONDA_ENVIRONMENT_PATH}/bin:${PATH} - -# Enable rocm-ci environment -SHELL ["conda", "run", "-n", "rocm-ci", "/bin/bash", "-c"] - -# Some DLLs in the conda environment have conflict with the one installed in Ubuntu system. -# For example, the GCC version in the conda environment is 12.x, while the one in the Ubuntu 22.04 is 11.x. -# ln -sf to make sure we always use libstdc++.so.6 and libgcc_s.so.1 in the system. -RUN ln -sf /usr/lib/x86_64-linux-gnu/libstdc++.so.6 ${CONDA_ENVIRONMENT_PATH}/bin/../lib/libstdc++.so.6 -RUN ln -sf /usr/lib/x86_64-linux-gnu/libgcc_s.so.1 ${CONDA_ENVIRONMENT_PATH}/bin/../lib/libgcc_s.so.1 - -RUN pip install packaging \ - ml_dtypes==0.5.0 \ - pytest==7.4.4 \ - pytest-xdist \ - pytest-rerunfailures \ - scipy==1.14.1 \ - numpy==1.26.4 - -RUN apt install -y git - -# Install Cupy to decrease CPU utilization -# Note that the version of Cupy requires numpy < 1.27 -RUN git clone https://github.com/ROCm/cupy && cd cupy && \ - git checkout 432a8683351d681e00903640489cb2f4055d2e09 && \ - export CUPY_INSTALL_USE_HIP=1 && \ - export ROCM_HOME=/opt/rocm && \ - export HCC_AMDGPU_TARGET=gfx906,gfx908,gfx90a && \ - git submodule update --init && \ - pip install -e . --no-cache-dir -vvvv