onnxruntime/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml
kailums 1b38c05544
change ci docker image to rocm6.1 (#21296)
### Description
<!-- Describe your changes. -->
There is a bug for kernel running on rocm6.0, so change ci docker image
to rocm6.1

For the torch installed in the docker image, change to rocm repo when it
is not 6.0 version.

### Motivation and Context
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->
2024-07-18 14:50:01 +08:00

192 lines
5.6 KiB
YAML

##### start trigger Don't edit it manually, Please do edit set-trigger-rules.py ####
trigger:
branches:
include:
- main
- rel-*
paths:
exclude:
- docs/**
- README.md
- CONTRIBUTING.md
- BUILD.md
- 'js/web'
- 'onnxruntime/core/providers/js'
pr:
branches:
include:
- main
- rel-*
paths:
exclude:
- docs/**
- README.md
- CONTRIBUTING.md
- BUILD.md
- 'js/web'
- 'onnxruntime/core/providers/js'
#### end trigger ####
name: 'linux_ci_$(Date:yyyyMMdd)_$(Rev:r)'
# gid of video and render group on gcramdrr1-mi100-085 and -86
variables:
- name: video
value: 44
- name: render
value: 109
- name: RocmVersion
value: 6.1
- name: RocmVersionPatchSuffix
value: ".3"
jobs:
- job: Linux_Build
variables:
skipComponentGovernanceDetection: true
CCACHE_DIR: $(Pipeline.Workspace)/ccache
TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)]
workspace:
clean: all
pool: onnxruntime-Ubuntu2204-AMD-CPU
timeoutInMinutes: 240
steps:
- task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3
displayName: 'Clean Agent Directories'
condition: always()
- checkout: self
clean: true
submodules: recursive
- template: templates/get-docker-image-steps.yml
parameters:
Dockerfile: tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile
Context: tools/ci_build/github/linux/docker
DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)$(RocmVersionPatchSuffix)"
Repository: onnxruntimetrainingmigraphx-cibuild-rocm$(RocmVersion)
- task: Cache@2
inputs:
key: '"$(TODAY)" | "$(Build.SourceBranch)" | "$(Build.SourceVersion)"'
path: $(CCACHE_DIR)
cacheHitVar: CACHE_RESTORED
restoreKeys: |
"$(TODAY)" | "$(Build.SourceBranch)"
"$(TODAY)" |
displayName: Cache Task
- script: mkdir -p $(CCACHE_DIR)
condition: ne(variables.CACHE_RESTORED, 'true')
displayName: Create Cache Dir
- task: CmdLine@2
inputs:
script: |
docker run --rm \
--security-opt seccomp=unconfined \
--shm-size=1024m \
--user $UID:$(id -g $USER) \
--volume $(Build.SourcesDirectory):/onnxruntime_src \
--volume $(Build.BinariesDirectory):/build \
--volume $(CCACHE_DIR):/cache \
-e CCACHE_DIR=/cache \
--workdir /onnxruntime_src \
onnxruntimetrainingmigraphx-cibuild-rocm$(RocmVersion) \
/bin/bash -c "
set -ex; \
env; \
ccache -s; \
python tools/ci_build/build.py \
--config Release \
--enable_training \
--cmake_extra_defines \
CMAKE_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ \
onnxruntime_BUILD_KERNEL_EXPLORER=OFF \
onnxruntime_USE_COMPOSABLE_KERNEL=OFF \
--mpi_home /opt/ompi \
--use_migraphx \
--rocm_version=$(RocmVersion) \
--rocm_home /opt/rocm \
--nccl_home /opt/rocm \
--enable_nccl \
--update \
--build_dir /build \
--build \
--parallel \
--build_wheel \
--skip_submodule_sync \
--use_cache \
--skip_tests --cmake_path /usr/bin/cmake --ctest_path /usr/bin/ctest; \
ccache -sv; \
ccache -z"
workingDirectory: $(Build.SourcesDirectory)
displayName: 'Build onnxruntime'
- task: CmdLine@2
inputs:
script: |
cd $(Build.BinariesDirectory)/Release
find -executable -type f > $(Build.BinariesDirectory)/Release/perms.txt
displayName: 'Find Executable Files'
- task: PublishPipelineArtifact@0
displayName: 'Publish Pipeline Artifact'
inputs:
artifactName: 'drop-linux'
targetPath: '$(Build.BinariesDirectory)/Release'
- template: templates/explicitly-defined-final-tasks.yml
- job: Linux_Test
workspace:
clean: all
pool: AMD-GPU
dependsOn:
- Linux_Build
timeoutInMinutes: 120
steps:
- task: DownloadPipelineArtifact@2
displayName: 'Download Pipeline Artifact'
inputs:
buildType: 'current'
artifactName: 'drop-linux'
targetPath: '$(Build.BinariesDirectory)/Release'
- checkout: self
clean: true
submodules: recursive
- template: templates/get-docker-image-steps.yml
parameters:
Dockerfile: tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile
Context: tools/ci_build/github/linux/docker
DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)$(RocmVersionPatchSuffix)"
Repository: onnxruntimetrainingmigraphx-cibuild-rocm$(RocmVersion)
- task: CmdLine@2
inputs:
script: |
docker run --rm \
--security-opt seccomp=unconfined \
--shm-size=1024m \
--device=/dev/kfd \
--device=/dev/dri/renderD$DRIVER_RENDER \
--group-add $(video) \
--group-add $(render) \
--user $UID:$(id -g $USER) \
--volume $(Build.SourcesDirectory):/onnxruntime_src \
--volume $(Build.BinariesDirectory):/build \
--workdir /build/Release \
onnxruntimetrainingmigraphx-cibuild-rocm$(RocmVersion) \
/bin/bash -c "
set -ex; \
cd /build/Release && xargs -a /build/Release/perms.txt chmod a+x; \
bash /onnxruntime_src/tools/ci_build/github/pai/pai_test_launcher.sh"
workingDirectory: $(Build.SourcesDirectory)
displayName: 'Run onnxruntime unit tests'
- template: templates/clean-agent-build-directory-step.yml