[ROCm] Python 3.10 in ROCm CI, and ROCm 6.2.3 in MigraphX CI (#22527)

### Description
Upgrade python from 3.9 to 3.10 in ROCm and MigraphX docker files and CI
pipelines. Upgrade the ROCm version to 6.2.3 in most places; the ROCm CI
pipeline is the exception (see the note below).

Some improvements/upgrades on ROCm/Migraphx docker or pipeline:
* rocm 6.0/6.1.3 => 6.2.3
* python 3.9 => 3.10
* Ubuntu 20.04 => 22.04
* Also upgrade ml_dtypes, numpy and scipy packages.
* Fix message "ROCm version from ..." with correct file path in
CMakeLists.txt
* Exclude some NHWC tests since ROCm EP lacks support for NHWC
convolution.

#### ROCm CI Pipeline:
ROCm 6.1.3 is kept in the pipeline for now.
- Failed after upgrading to ROCm 6.2.3: `HIPBLAS_STATUS_INVALID_VALUE ;
GPU=0 ; hostname=76123b390aed ;
file=/onnxruntime_src/onnxruntime/core/providers/rocm/rocm_execution_provider.cc
; line=170 ; expr=hipblasSetStream(hipblas_handle_, stream);` . It needs
further investigation.
- cupy issues:
(1) cupy currently supports numpy < 1.27 and might not work with numpy 2.x,
so we locked numpy==1.26.4 for now.
(2) cupy support of ROCm 6.2 is still in progress:
https://github.com/cupy/cupy/issues/8606.

Note on miniconda: its bundled libstdc++.so.6 and libgcc_s.so.1 might
conflict with the system ones, so we created symlinks to use the
system ones.

#### MigraphX CI pipeline

MigraphX CI does not use cupy, and we are able to use ROCm 6.2.3 and
numpy 2.x in the pipeline.

#### Other attempts

Other things that I've tried which might help in the future: 

Attempt to use a single docker file for both ROCm and Migraphx:
https://github.com/microsoft/onnxruntime/pull/22478

Upgrade to ubuntu 24.04 and python 3.12, and use venv like
[this](27903e7ff1/tools/ci_build/github/linux/docker/rocm-ci-pipeline-env.Dockerfile).

### Motivation and Context
In the 1.20 release, the ROCm NuGet packaging pipeline will use ROCm 6.2:
https://github.com/microsoft/onnxruntime/pull/22461.
This upgrades rocm to 6.2.3 in CI pipelines to be consistent.
This commit is contained in:
Tianlei Wu 2024-10-25 11:47:16 -07:00 committed by GitHub
parent 28efacfd5a
commit b4afc6266f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
11 changed files with 70 additions and 61 deletions

View file

@ -291,12 +291,50 @@ if (onnxruntime_USE_ROCM)
message(FATAL_ERROR "ROCM does not support build with CUDA!")
endif()
# replicate strategy used by pytorch to get ROCM_VERSION
# https://github.com/pytorch/pytorch/blob/5c5b71b6eebae76d744261715231093e62f0d090/cmake/public/LoadHIP.cmake
# with modification
if (EXISTS "${onnxruntime_ROCM_HOME}/.info/version")
message("\n***** ROCm version from ${onnxruntime_ROCM_HOME}/.info/version ****\n")
file(READ "${onnxruntime_ROCM_HOME}/.info/version" ROCM_VERSION_DEV_RAW)
string(REGEX MATCH "^([0-9]+)\.([0-9]+)\.([0-9]+)-.*$" ROCM_VERSION_MATCH ${ROCM_VERSION_DEV_RAW})
elseif (EXISTS "${onnxruntime_ROCM_HOME}/include/rocm_version.h")
message("\n***** ROCm version from ${onnxruntime_ROCM_HOME}/include/rocm_version.h ****\n")
file(READ "${onnxruntime_ROCM_HOME}/include/rocm_version.h" ROCM_VERSION_H_RAW)
string(REGEX MATCH "\"([0-9]+)\.([0-9]+)\.([0-9]+).*\"" ROCM_VERSION_MATCH ${ROCM_VERSION_H_RAW})
elseif (EXISTS "${onnxruntime_ROCM_HOME}/include/rocm-core/rocm_version.h")
message("\n***** ROCm version from ${onnxruntime_ROCM_HOME}/include/rocm-core/rocm_version.h ****\n")
file(READ "${onnxruntime_ROCM_HOME}/include/rocm-core/rocm_version.h" ROCM_VERSION_H_RAW)
string(REGEX MATCH "\"([0-9]+)\.([0-9]+)\.([0-9]+).*\"" ROCM_VERSION_MATCH ${ROCM_VERSION_H_RAW})
endif()
if (ROCM_VERSION_MATCH)
set(ROCM_VERSION_DEV_MAJOR ${CMAKE_MATCH_1})
set(ROCM_VERSION_DEV_MINOR ${CMAKE_MATCH_2})
set(ROCM_VERSION_DEV_PATCH ${CMAKE_MATCH_3})
set(ROCM_VERSION_DEV "${ROCM_VERSION_DEV_MAJOR}.${ROCM_VERSION_DEV_MINOR}.${ROCM_VERSION_DEV_PATCH}")
math(EXPR ROCM_VERSION_DEV_INT "(${ROCM_VERSION_DEV_MAJOR}*10000) + (${ROCM_VERSION_DEV_MINOR}*100) + ${ROCM_VERSION_DEV_PATCH}")
message("ROCM_VERSION_DEV: ${ROCM_VERSION_DEV}")
message("ROCM_VERSION_DEV_MAJOR: ${ROCM_VERSION_DEV_MAJOR}")
message("ROCM_VERSION_DEV_MINOR: ${ROCM_VERSION_DEV_MINOR}")
message("ROCM_VERSION_DEV_PATCH: ${ROCM_VERSION_DEV_PATCH}")
message("ROCM_VERSION_DEV_INT: ${ROCM_VERSION_DEV_INT}")
else()
message(FATAL_ERROR "Cannot determine ROCm version string")
endif()
if (NOT CMAKE_HIP_COMPILER)
set(CMAKE_HIP_COMPILER "${onnxruntime_ROCM_HOME}/llvm/bin/clang++")
endif()
if (NOT CMAKE_HIP_ARCHITECTURES)
set(CMAKE_HIP_ARCHITECTURES "gfx908;gfx90a;gfx1030;gfx1100;gfx1101;gfx940;gfx941;gfx942;gfx1200;gfx1201")
if (ROCM_VERSION_DEV VERSION_LESS "6.2")
message(FATAL_ERROR "CMAKE_HIP_ARCHITECTURES is not set when ROCm version < 6.2")
else()
set(CMAKE_HIP_ARCHITECTURES "gfx908;gfx90a;gfx1030;gfx1100;gfx1101;gfx940;gfx941;gfx942;gfx1200;gfx1201")
endif()
endif()
file(GLOB rocm_cmake_components ${onnxruntime_ROCM_HOME}/lib/cmake/*)
@ -328,35 +366,6 @@ if (onnxruntime_USE_ROCM)
set(onnxruntime_HIPIFY_PERL ${HIPIFY_PERL_PATH}/hipify-perl)
endif()
# replicate strategy used by pytorch to get ROCM_VERSION
# https://github.com/pytorch/pytorch/blob/5c5b71b6eebae76d744261715231093e62f0d090/cmake/public/LoadHIP.cmake
# with modification
if (EXISTS "${onnxruntime_ROCM_HOME}/.info/version")
file(READ "${onnxruntime_ROCM_HOME}/.info/version" ROCM_VERSION_DEV_RAW)
string(REGEX MATCH "^([0-9]+)\.([0-9]+)\.([0-9]+)-.*$" ROCM_VERSION_MATCH ${ROCM_VERSION_DEV_RAW})
elseif (EXISTS "${onnxruntime_ROCM_HOME}/include/rocm_version.h")
file(READ "${onnxruntime_ROCM_HOME}/include/rocm_version.h" ROCM_VERSION_H_RAW)
string(REGEX MATCH "\"([0-9]+)\.([0-9]+)\.([0-9]+).*\"" ROCM_VERSION_MATCH ${ROCM_VERSION_H_RAW})
elseif (EXISTS "${onnxruntime_ROCM_HOME}/include/rocm-core/rocm_version.h")
file(READ "${onnxruntime_ROCM_HOME}/include/rocm-core/rocm_version.h" ROCM_VERSION_H_RAW)
string(REGEX MATCH "\"([0-9]+)\.([0-9]+)\.([0-9]+).*\"" ROCM_VERSION_MATCH ${ROCM_VERSION_H_RAW})
endif()
if (ROCM_VERSION_MATCH)
set(ROCM_VERSION_DEV_MAJOR ${CMAKE_MATCH_1})
set(ROCM_VERSION_DEV_MINOR ${CMAKE_MATCH_2})
set(ROCM_VERSION_DEV_PATCH ${CMAKE_MATCH_3})
set(ROCM_VERSION_DEV "${ROCM_VERSION_DEV_MAJOR}.${ROCM_VERSION_DEV_MINOR}.${ROCM_VERSION_DEV_PATCH}")
math(EXPR ROCM_VERSION_DEV_INT "(${ROCM_VERSION_DEV_MAJOR}*10000) + (${ROCM_VERSION_DEV_MINOR}*100) + ${ROCM_VERSION_DEV_PATCH}")
else()
message(FATAL_ERROR "Cannot determine ROCm version string")
endif()
message("\n***** ROCm version from ${onnxruntime_ROCM_HOME}/.info/version ****\n")
message("ROCM_VERSION_DEV: ${ROCM_VERSION_DEV}")
message("ROCM_VERSION_DEV_MAJOR: ${ROCM_VERSION_DEV_MAJOR}")
message("ROCM_VERSION_DEV_MINOR: ${ROCM_VERSION_DEV_MINOR}")
message("ROCM_VERSION_DEV_PATCH: ${ROCM_VERSION_DEV_PATCH}")
message("ROCM_VERSION_DEV_INT: ${ROCM_VERSION_DEV_INT}")
message("\n***** HIP LANGUAGE CONFIG INFO ****\n")
message("CMAKE_HIP_COMPILER: ${CMAKE_HIP_COMPILER}")
message("CMAKE_HIP_ARCHITECTURES: ${CMAKE_HIP_ARCHITECTURES}")

View file

@ -5,7 +5,7 @@
# Dockerfile to run ONNXRuntime with MIGraphX integration
#--------------------------------------------------------------------------
FROM rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1
FROM rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0
ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime
ARG ONNXRUNTIME_BRANCH=main

View file

@ -5,7 +5,7 @@
# Dockerfile to run ONNXRuntime with ROCm integration
#--------------------------------------------------------------------------
FROM rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1
FROM rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0
ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime
ARG ONNXRUNTIME_BRANCH=main

View file

@ -292,7 +292,7 @@ Nothing else from ONNX Runtime source tree will be copied/installed to the image
Note: When running the container you built in Docker, please either use 'nvidia-docker' command instead of 'docker', or use Docker command-line options to make sure NVIDIA runtime will be used and appropriate files mounted from host. Otherwise, CUDA libraries won't be found. You can also [set NVIDIA runtime as default in Docker](https://github.com/dusty-nv/jetson-containers#docker-default-runtime).
## MIGraphX
**Ubuntu 20.04, ROCm6.0, MIGraphX**
**Ubuntu 22.04, ROCm6.2.3, MIGraphX**
1. Build the docker image from the Dockerfile in this repository.
```
@ -306,7 +306,7 @@ Note: When running the container you built in Docker, please either use 'nvidia-
```
## ROCm
**Ubuntu 20.04, ROCm6.0**
**Ubuntu 22.04, ROCm6.2.3**
1. Build the docker image from the Dockerfile in this repository.
```

View file

@ -159,7 +159,7 @@ TEST(InternalTestingEP, PreventSaveOfModelWithCompiledOps) {
// the internal NHWC operators are only included as part of contrib ops currently. as the EP requests the NHWC
// version of the ONNX operator when matching a static kernel, those are required.
#if !defined(DISABLE_CONTRIB_OPS)
#if !defined(DISABLE_CONTRIB_OPS) && !defined(USE_ROCM)
TEST(InternalTestingEP, TestMixOfStaticAndCompiledKernels) {
const ORTCHAR_T* ort_model_path = ORT_MODEL_FOLDER "transform/fusion/conv_relu_opset12.onnx";
@ -256,10 +256,6 @@ TEST(InternalTestingEP, TestNhwcConversionOfStaticKernels) {
run_test(ort_model_path);
}
// This test can be deprecated now as the code logic has been changed so the model is not applicable
// TEST(InternalTestingEP, TestRegisterAllocatorHandlesUsageInMultipleSessions) {
//}
// make sure allocators returned by SessionState::GetAllocator are valid when IExecutionProvider::ReplaceAllocator
// is used. if something is off InferenceSession::Initialize will fail.
TEST(InternalTestingEP, TestReplaceAllocatorDoesntBreakDueToLocalAllocatorStorage) {

View file

@ -37,9 +37,7 @@ variables:
- name: render
value: 109
- name: RocmVersion
value: 6.1
- name: RocmVersionPatchSuffix
value: ".3"
value: 6.2.3
jobs:
- job: Linux_Build
@ -66,7 +64,7 @@ jobs:
parameters:
Dockerfile: tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile
Context: tools/ci_build/github/linux/docker
DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)$(RocmVersionPatchSuffix)"
DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)"
Repository: onnxruntimetrainingmigraphx-cibuild-rocm$(RocmVersion)
- task: Cache@2
@ -165,7 +163,7 @@ jobs:
parameters:
Dockerfile: tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile
Context: tools/ci_build/github/linux/docker
DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)$(RocmVersionPatchSuffix)"
DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)"
Repository: onnxruntimetrainingmigraphx-cibuild-rocm$(RocmVersion)
- task: CmdLine@2

View file

@ -37,9 +37,7 @@ variables:
- name: render
value: 109
- name: RocmVersion
value: 6.1
- name: RocmVersionPatchSuffix
value: ".3"
value: 6.1.3
jobs:
- job: Linux_Build
@ -66,7 +64,7 @@ jobs:
parameters:
Dockerfile: tools/ci_build/github/linux/docker/rocm-ci-pipeline-env.Dockerfile
Context: tools/ci_build/github/linux/docker
DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)$(RocmVersionPatchSuffix)"
DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)"
Repository: onnxruntimerocm-cibuild-rocm$(RocmVersion)
- task: Cache@2
@ -166,7 +164,7 @@ jobs:
parameters:
Dockerfile: tools/ci_build/github/linux/docker/rocm-ci-pipeline-env.Dockerfile
Context: tools/ci_build/github/linux/docker
DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)$(RocmVersionPatchSuffix)"
DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)"
Repository: onnxruntimerocm-cibuild-rocm$(RocmVersion)
- task: CmdLine@2
@ -231,7 +229,11 @@ jobs:
-e KERNEL_EXPLORER_TEST_USE_CUPY=1 \
-e CUPY_CACHE_DIR=/build/Release \
onnxruntimerocm-cibuild-rocm$(RocmVersion) \
pytest /onnxruntime_src/onnxruntime/python/tools/kernel_explorer/ -n 4 --reruns 1 --durations=100
/bin/bash -c "
set -ex; \
python --version; \
ls /opt/miniconda/envs/rocm-ci/lib/; \
pytest /onnxruntime_src/onnxruntime/python/tools/kernel_explorer/ -n 4 --reruns 1 --durations=100"
workingDirectory: $(Build.SourcesDirectory)
displayName: 'Run kernel explorer tests'
condition: succeededOrFailed()

View file

@ -6,7 +6,7 @@ ARG LD_LIBRARY_PATH_ARG=${DEVTOOLSET_ROOTPATH}/usr/lib64:${DEVTOOLSET_ROOTPATH}/
ARG PREPEND_PATH=${DEVTOOLSET_ROOTPATH}/usr/bin:
FROM $BASEIMAGE AS base_image
ARG ROCM_VERSION=5.5
ARG ROCM_VERSION=6.2.3
#Add our own dependencies
ADD scripts /tmp/scripts

View file

@ -1,7 +1,7 @@
# Refer to https://github.com/RadeonOpenCompute/ROCm-docker/blob/master/dev/Dockerfile-ubuntu-22.04-complete
FROM ubuntu:22.04
ARG ROCM_VERSION=6.0
ARG ROCM_VERSION=6.2.3
ARG AMDGPU_VERSION=${ROCM_VERSION}
ARG APT_PREF='Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600'
@ -68,7 +68,7 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86
# Create migraphx-ci environment
ENV CONDA_ENVIRONMENT_PATH /opt/miniconda/envs/migraphx-ci
ENV CONDA_DEFAULT_ENV migraphx-ci
RUN conda create -y -n ${CONDA_DEFAULT_ENV} python=3.9
RUN conda create -y -n ${CONDA_DEFAULT_ENV} python=3.10
ENV PATH ${CONDA_ENVIRONMENT_PATH}/bin:${PATH}
# Enable migraphx-ci environment
@ -80,4 +80,4 @@ RUN ln -sf /usr/lib/x86_64-linux-gnu/libstdc++.so.6 ${CONDA_ENVIRONMENT_PATH}/bi
# Install migraphx
RUN apt update && apt install -y migraphx
RUN pip install numpy packaging ml_dtypes==0.3.0
RUN pip install numpy packaging ml_dtypes==0.5.0

View file

@ -1,7 +1,7 @@
# Refer to https://github.com/RadeonOpenCompute/ROCm-docker/blob/master/dev/Dockerfile-ubuntu-22.04-complete
FROM ubuntu:22.04
ARG ROCM_VERSION=6.0
ARG ROCM_VERSION=6.1.3
ARG AMDGPU_VERSION=${ROCM_VERSION}
ARG APT_PREF='Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600'
@ -67,26 +67,30 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86
# Create rocm-ci environment
ENV CONDA_ENVIRONMENT_PATH /opt/miniconda/envs/rocm-ci
ENV CONDA_DEFAULT_ENV rocm-ci
RUN conda create -y -n ${CONDA_DEFAULT_ENV} python=3.9
RUN conda create -y -n ${CONDA_DEFAULT_ENV} python=3.10
ENV PATH ${CONDA_ENVIRONMENT_PATH}/bin:${PATH}
# Enable rocm-ci environment
SHELL ["conda", "run", "-n", "rocm-ci", "/bin/bash", "-c"]
# ln -sf is needed to make sure that version `GLIBCXX_3.4.30' is found
# Some shared libraries in the conda environment conflict with the ones installed in the Ubuntu system.
# For example, the GCC version in the conda environment is 12.x, while the one in the Ubuntu 22.04 is 11.x.
# ln -sf to make sure we always use libstdc++.so.6 and libgcc_s.so.1 in the system.
RUN ln -sf /usr/lib/x86_64-linux-gnu/libstdc++.so.6 ${CONDA_ENVIRONMENT_PATH}/bin/../lib/libstdc++.so.6
RUN ln -sf /usr/lib/x86_64-linux-gnu/libgcc_s.so.1 ${CONDA_ENVIRONMENT_PATH}/bin/../lib/libgcc_s.so.1
RUN pip install packaging \
ml_dtypes==0.3.0 \
ml_dtypes==0.5.0 \
pytest==7.4.4 \
pytest-xdist \
pytest-rerunfailures \
scipy==1.10.0 \
numpy==1.24.1
scipy==1.14.1 \
numpy==1.26.4
RUN apt install -y git
# Install Cupy to decrease CPU utilization
# Note that the version of Cupy requires numpy < 1.27
RUN git clone https://github.com/ROCm/cupy && cd cupy && \
git checkout 432a8683351d681e00903640489cb2f4055d2e09 && \
export CUPY_INSTALL_USE_HIP=1 && \

View file

@ -2,7 +2,7 @@
set -e -x
# version
ROCM_VERSION=6.0
ROCM_VERSION=6.2.3
while getopts "r:" parameter_Option
do case "${parameter_Option}"