onnxruntime/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-amd-e2e-test-ci-pipeline.yml
Suffian Khan 9f14af9809
Add BERT-L perf regression test on MI100 and re-enable batch size test (#7240)
* restore bs test and add perf test

* update perf number and fix path to results
2021-04-05 15:51:52 -07:00

77 lines
2.8 KiB
YAML

# No CI trigger: pushes do not start this pipeline.
# NOTE(review): the run name says "nightly", so a schedule is presumably
# configured outside this file — confirm.
trigger: none

# Run name; $(Date:yyyyMMdd) and $(Rev:r) are expanded by Azure Pipelines.
name: 'orttraining_amd_nightly_$(Date:yyyyMMdd)_$(Rev:r)'

# Agent pool that every job in this pipeline runs on.
pool:
  name: AMD-GPU
jobs:
# Builds ONNX Runtime with training + ROCm enabled, then runs the unit tests
# and the BERT-L batch size, performance and convergence end-to-end tests.
- job: Onnxruntime_Linux_GPU_AMD_Training_E2E_Test
  timeoutInMinutes: 60

  steps:
  - checkout: self
    clean: true
    submodules: recursive

  # The ##vso logging commands prepend conda and OpenMPI to PATH and set
  # LD_LIBRARY_PATH for the steps that follow.
  # NOTE(review): the `eval` of the conda hook only affects this step's own
  # shell; later steps presumably rely on the PATH entry alone — confirm.
  - script: |-
      echo "##vso[task.prependpath]/home/ciagent/conda/bin/"
      echo "##vso[task.prependpath]/home/ciagent/pkg/openmpi-4.0.5/bin/"
      echo '##vso[task.setvariable variable=LD_LIBRARY_PATH]/home/ciagent/pkg/openmpi-4.0.5/lib/'
      eval "$('/home/ciagent/conda/bin/conda' 'shell.bash' 'hook' 2> /dev/null)"
      echo "Selecting GPU based on HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES"
    displayName: 'Initialize environment'

  # Update the blob snapshot timestamp and the SHA-256 digest together if the
  # E2E test data changes. The URL is single-quoted because it contains `?`,
  # which the shell would otherwise treat as a pathname-expansion pattern.
  - script: |-
      python orttraining/tools/ci_test/download_azure_blob_archive.py \
        --azure_blob_url 'https://onnxruntimetestdata.blob.core.windows.net/training/onnxruntime_training_data.zip?snapshot=2020-06-15T23:17:35.8314853Z' \
        --target_dir training_e2e_test_data \
        --archive_sha256_digest B01C169B6550D1A0A6F1B4E2F34AE2A8714B52DBB70AC04DA85D371F691BDFF9
    displayName: 'Download onnxruntime_training_data.zip data'

  # Configure and build (including the Python wheel) into ./build.
  # --skip_tests: tests are not run here; the following steps run them.
  - script: |-
      python tools/ci_build/build.py \
        --config RelWithDebInfo \
        --enable_training \
        --mpi_home /home/ciagent/pkg/openmpi-4.0.5 \
        --use_rocm \
        --rocm_home /opt/rocm \
        --nccl_home /opt/rocm \
        --update \
        --build_dir ./build \
        --build \
        --parallel 8 \
        --build_wheel \
        --skip_tests
    displayName: 'Build onnxruntime'

  # Run the test launcher script from inside the build output directory.
  - script: |-
      cd ./build/RelWithDebInfo &&\
      ../../tools/ci_build/github/pai/pai_test_launcher.sh
    displayName: 'Run onnxruntime unit tests'

  # The E2E test steps below use succeededOrFailed() so that each one still
  # runs when an earlier step has failed.
  - script: |-
      python orttraining/tools/ci_test/run_batch_size_test.py \
        --binary_dir build/RelWithDebInfo \
        --model_root training_e2e_test_data/models \
        --gpu_sku MI100_32G
    displayName: 'Run C++ BERT-L batch size test'
    condition: succeededOrFailed()  # ensure all tests are run

  - script: |-
      python orttraining/tools/ci_test/run_bert_perf_test.py \
        --binary_dir build/RelWithDebInfo \
        --model_root training_e2e_test_data/models \
        --training_data_root training_e2e_test_data/data \
        --gpu_sku MI100_32G
    displayName: 'Run C++ BERT-L performance test'
    condition: succeededOrFailed()  # ensure all tests are run

  - script: |-
      python orttraining/tools/ci_test/run_convergence_test.py \
        --binary_dir build/RelWithDebInfo \
        --model_root training_e2e_test_data/models \
        --training_data_root training_e2e_test_data/data \
        --gpu_sku MI100_32G
    displayName: 'Run C++ BERT-L convergence test'
    condition: succeededOrFailed()  # ensure all tests are run