[ROCm] Build ROCm CI with Release config and enable kernel explorer test (#13687)

### Description
<!-- Describe your changes. -->
1. Build ROCm CI with Release config to save time.
2. Use 32 threads for the build; the new CI machine has 256 threads available.
3. Enable the ROCm kernel explorer tests.


### Motivation and Context
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->

Co-authored-by: peixuanzuo <peixuanzuo@linmif39a000004.zvflicr54joexhdgnhvmxrxygg.phxx.internal.cloudapp.net>
This commit is contained in:
PeixuanZuo 2022-11-21 10:04:10 +08:00 committed by GitHub
parent 8472876155
commit da2bd3ad4d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 27 additions and 18 deletions

View file

@ -46,7 +46,7 @@ jobs:
--workdir /onnxruntime_src \
onnxruntimetrainingmigraphx-cibuild-rocm$(RocmVersion) \
python tools/ci_build/build.py \
--config RelWithDebInfo \
--config Release \
--cmake_extra_defines \
CMAKE_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ \
--mpi_home /opt/ompi \
@ -57,7 +57,7 @@ jobs:
--update \
--build_dir /build \
--build \
--parallel 16 \
--parallel 32 \
--build_wheel \
--skip_submodule_sync \
--skip_tests
@ -79,7 +79,7 @@ jobs:
--user $UID:$(id -g $USER) \
--volume $(Build.SourcesDirectory):/onnxruntime_src \
--volume $(Build.BinariesDirectory):/build \
--workdir /build/RelWithDebInfo \
--workdir /build/Release \
onnxruntimetrainingmigraphx-cibuild-rocm$(RocmVersion) \
/onnxruntime_src/tools/ci_build/github/pai/migraphx_test_launcher.sh
workingDirectory: $(Build.SourcesDirectory)

View file

@ -51,12 +51,13 @@ jobs:
script: |-
export ROCM_HOME=/opt/rocm
python tools/ci_build/build.py \
--config RelWithDebInfo \
--config Release \
--enable_training \
--enable_training_torch_interop \
--mpi_home /opt/ompi \
--cmake_extra_defines \
CMAKE_HIP_COMPILER=${ROCM_HOME}/llvm/bin/clang++ \
onnxruntime_BUILD_KERNEL_EXPLORER=ON \
--use_rocm \
--rocm_version=$(RocmVersion) \
--rocm_home ${ROCM_HOME} \
@ -64,7 +65,7 @@ jobs:
--update \
--build_dir ./build \
--build \
--parallel 16 \
--parallel 32 \
--build_wheel \
--skip_tests
displayName: 'Build onnxruntime'
@ -76,14 +77,22 @@ jobs:
- task: CmdLine@2
inputs:
script: |-
cd ./build/RelWithDebInfo &&\
cd ./build/Release &&\
../../tools/ci_build/github/pai/pai_test_launcher.sh
displayName: 'Run onnxruntime unit tests'
- task: CmdLine@2
inputs:
script: |-
cd ./build/RelWithDebInfo
export KERNEL_EXPLORER_BUILD_DIR=./build/Release
pytest ./onnxruntime/python/tools/kernel_explorer/
displayName: 'Run kernel explorer tests'
condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true'))
- task: CmdLine@2
inputs:
script: |-
cd ./build/Release
export PYTHONPATH=$PWD
python -m onnxruntime.training.ortmodule.torch_cpp_extensions.install
displayName: 'Compile torch extensions into build directory'
@ -92,7 +101,7 @@ jobs:
- task: CmdLine@2
inputs:
script: |-
cd ./build/RelWithDebInfo
cd ./build/Release
export PYTHONPATH=$PWD
export ORTMODULE_ONNX_OPSET_VERSION=15
python \
@ -120,7 +129,7 @@ jobs:
- task: CmdLine@2
inputs:
script: |-
cd ./build/RelWithDebInfo
cd ./build/Release
export PYTHONPATH=$PWD
export ORTMODULE_ONNX_OPSET_VERSION=15
python \
@ -147,7 +156,7 @@ jobs:
condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true')) # ensure all tests are run when the build successed
# - script: |-
# cd ./build/RelWithDebInfo
# cd ./build/Release
# export PYTHONPATH=$PWD
# python \
# /stage/huggingface-transformers/examples/pytorch/translation/run_translation.py \
@ -176,7 +185,7 @@ jobs:
# todo: investigate RoBERTa high run variability on ROCm 5.2
#- script: |-
# cd ./build/RelWithDebInfo
# cd ./build/Release
# export PYTHONPATH=$PWD
# python \
# /stage/huggingface-transformers/examples/pytorch/question-answering/run_qa.py \
@ -204,7 +213,7 @@ jobs:
- task: CmdLine@2
inputs:
script: |-
cd ./build/RelWithDebInfo
cd ./build/Release
export PYTHONPATH=$PWD
export ORTMODULE_ONNX_OPSET_VERSION=15
python \
@ -230,7 +239,7 @@ jobs:
condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true')) # ensure all tests are run when the build successed
#- script: |-
# cd ./build/RelWithDebInfo
# cd ./build/Release
# export PYTHONPATH=$PWD
# python \
# /stage/huggingface-transformers/examples/pytorch/text-classification/run_glue.py \
@ -252,7 +261,7 @@ jobs:
# condition: succeededOrFailed() # ensure all tests are run
#- script: |-
# cd ./build/RelWithDebInfo
# cd ./build/Release
# export PYTHONPATH=$PWD
# python \
# /stage/huggingface-transformers/examples/pytorch/translation/run_translation.py \
@ -297,7 +306,7 @@ jobs:
inputs:
script: |-
python orttraining/tools/ci_test/run_batch_size_test.py \
--binary_dir build/RelWithDebInfo \
--binary_dir build/Release \
--model_root training_e2e_test_data/models \
--gpu_sku MI100_32G
displayName: 'Run C++ BERT-L batch size test'
@ -308,7 +317,7 @@ jobs:
inputs:
script: |-
python orttraining/tools/ci_test/run_bert_perf_test.py \
--binary_dir build/RelWithDebInfo \
--binary_dir build/Release \
--model_root training_e2e_test_data/models \
--training_data_root training_e2e_test_data/data \
--gpu_sku MI100_32G
@ -320,7 +329,7 @@ jobs:
inputs:
script: |-
python orttraining/tools/ci_test/run_convergence_test.py \
--binary_dir build/RelWithDebInfo \
--binary_dir build/Release \
--model_root training_e2e_test_data/models \
--training_data_root training_e2e_test_data/data \
--gpu_sku MI100_32G
@ -359,7 +368,7 @@ jobs:
# - task: CmdLine@2
# inputs:
# script: |-
# cd ./build/RelWithDebInfo
# cd ./build/Release
# unset PYTHONPATH
# rm -rf onnxruntime
# pip install ./dist/onnxruntime*.whl