mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-30 03:37:44 +00:00
405 lines
16 KiB
YAML
405 lines
16 KiB
YAML
# No CI trigger is configured for this pipeline.
trigger: none

# Run name template built from the build date and revision counters.
name: 'orttraining_ci_$(Date:yyyyMMdd)_$(Rev:r)'

jobs:
# Single job that runs on the AMD GPU agent pool.
- job: AMD_CI
  timeoutInMinutes: 150
  pool: 'AMD-GPU'
  workspace:
    # Wipe the whole agent workspace before the job starts.
    clean: all
|
|
|
|
# gid of video and render group on gcramdrr1-mi100-085 and -86
variables:
- name: video
  value: 44
- name: render
  value: 109
# Flag flipped to 'true' by a later step once the build finishes; test steps
# compare it against the string 'true', so keep it an explicit string here.
- name: onnxruntimeBuildSucceeded
  value: 'false'
# Quoted so the version is never read as a YAML float
# (an unquoted 5.10 would silently become 5.1).
- name: RocmVersion
  value: '5.4'
# Directory restored/saved by the Cache task and used by ccache.
- name: CCACHE_DIR
  value: $(Pipeline.Workspace)/ccache
# Pipeline start date formatted as ddMMyyyy; used as the leading cache-key segment.
- name: TODAY
  value: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)]
|
|
|
|
# Job container. The image is generated from
# tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile
# and is tagged with the ROCm version selected in the variables.
container:
  endpoint: onnxruntimecibuildenvironmentforamd
  image: onnxruntimecibuildenvironment.azurecr.io/rocm-ci-pipeline-env:rocm$(RocmVersion)
  # Exposes the GPU device nodes and adds the host's video/render groups.
  options: --privileged -e HIP_VISIBLE_DEVICES --security-opt seccomp=unconfined --shm-size=1024m --device=/dev/kfd --device=/dev/dri --group-add $(video) --group-add $(render)
|
|
|
|
steps:
# Check out the repository that triggered the run, cleaning the working tree first.
- checkout: self
  clean: true
|
|
|
|
# Log which GPU(s) the job sees through HIP_VISIBLE_DEVICES.
- displayName: 'Initialize environment'
  script: |-
    echo "Selecting GPU based on HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES"
|
|
|
|
#- script: |-
|
|
# sed -i 's|session_options.use_deterministic_compute = False|session_options.use_deterministic_compute = True|g' \
|
|
# orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py
|
|
# displayName: 'Toggle ON deterministic compute mode for ORTModule'
|
|
|
|
# Runs on the agent host (not inside the job container): prints the agent name
# and passes it to pai_get_thread.sh.
- task: CmdLine@2
  target: host
  displayName: 'Check ROCm Environment'
  inputs:
    script: |-
      echo $(Agent.Name)
      bash ./tools/ci_build/github/pai/pai_get_thread.sh $(Agent.Name)
|
|
|
|
# Restore/save the ccache directory. The primary key is date + branch + commit;
# the restore keys fall back to date + branch, then to the date alone.
- task: Cache@2
  displayName: Cache Task
  inputs:
    path: $(CCACHE_DIR)
    key: '"$(TODAY)" | "$(Build.SourceBranch)" | "$(Build.SourceVersion)"'
    restoreKeys: |
      "$(TODAY)" | "$(Build.SourceBranch)"
      "$(TODAY)" |
    # Variable that records whether the cache was restored.
    cacheHitVar: CACHE_RESTORED
|
|
|
|
# Create the ccache directory when the Cache task did not report an exact hit.
- script: mkdir -p $(CCACHE_DIR)
  displayName: Create Cache Dir
  condition: ne(variables.CACHE_RESTORED, 'true')
|
|
|
|
# Zero the ccache counters, then print them so the post-build stats reflect this build only.
- displayName: Show Cache Stats Before Building
  script: ccache -z && ccache -s
|
|
|
|
# Configure and build ONNX Runtime in Release mode with training and ROCm
# enabled, including the kernel explorer and the Python wheel. Tests are
# skipped here (--skip_tests) and run by the later steps.
# Every continued line ends in " \" so that arguments can never be glued
# together, regardless of how the following line is indented.
- task: CmdLine@2
  inputs:
    script: |-
      export ROCM_HOME=/opt/rocm
      python tools/ci_build/build.py \
        --config Release \
        --enable_training \
        --mpi_home /opt/ompi \
        --cmake_extra_defines \
            CMAKE_HIP_COMPILER=${ROCM_HOME}/llvm/bin/clang++ \
            onnxruntime_BUILD_KERNEL_EXPLORER=ON \
        --use_cache \
        --use_rocm \
        --rocm_version=$(RocmVersion) \
        --rocm_home ${ROCM_HOME} \
        --nccl_home ${ROCM_HOME} \
        --update \
        --build_dir ./build \
        --build \
        --parallel 32 \
        --build_wheel \
        --skip_tests
  displayName: 'Build onnxruntime'
|
|
|
|
# Print the ccache counters accumulated during the build.
- displayName: Show Cache Stats After Building
  script: ccache -s
|
|
|
|
# Sets onnxruntimeBuildSucceeded to true via a logging command. The test steps
# below check this variable in their conditions so they can still run after an
# earlier test step has failed.
- displayName: 'Set Onnxruntime Build Succeeded'
  bash: |-
    echo "##vso[task.setvariable variable=onnxruntimeBuildSucceeded]true"
|
|
|
|
# Launch the unit tests from inside the Release build directory.
- task: CmdLine@2
  displayName: 'Run onnxruntime unit tests'
  inputs:
    script: |-
      cd ./build/Release &&\
      ../../tools/ci_build/github/pai/pai_test_launcher.sh
|
|
|
|
# Kernel explorer pytest suite: 16 parallel workers, one rerun per failing test.
# Runs whenever the build succeeded, even if an earlier test step failed.
- task: CmdLine@2
  displayName: 'Run kernel explorer tests'
  condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true'))
  inputs:
    script: |-
      export KERNEL_EXPLORER_BUILD_DIR=./build/Release
      pytest ./onnxruntime/python/tools/kernel_explorer/ -n 16 --reruns 1
|
|
|
|
# Install the ORTModule torch C++ extensions, with PYTHONPATH pointing at the
# build directory so the freshly built package is the one being used.
- task: CmdLine@2
  displayName: 'Compile torch extensions into build directory'
  # Run whenever the build succeeded, even if an earlier test step failed.
  condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true'))
  inputs:
    script: |-
      cd ./build/Release
      export PYTHONPATH=$PWD
      python -m onnxruntime.training.ortmodule.torch_cpp_extensions.install
|
|
|
|
# Train bert-large-uncased for 260 steps with ORT via the Hugging Face run_mlm.py
# example, then compare ci-pipeline-actual.json against the stored reference
# results for this ROCm version.
- task: CmdLine@2
  displayName: 'Run Python Hugging-Face BERT-L test'
  # Run whenever the build succeeded, even if an earlier test step failed.
  condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true'))
  retryCountOnTaskFailure: 1
  inputs:
    script: |-
      cd ./build/Release
      export PYTHONPATH=$PWD
      export ORTMODULE_ONNX_OPSET_VERSION=15
      python \
        /stage/huggingface-transformers/examples/pytorch/language-modeling/run_mlm.py \
        --model_name_or_path bert-large-uncased \
        --dataset_name wikitext \
        --dataset_config_name wikitext-2-raw-v1 \
        --do_train \
        --max_steps 260 \
        --logging_steps 20 \
        --output_dir ./test-mlm-bbu \
        --overwrite_output_dir \
        --per_device_train_batch_size 8 \
        --fp16 \
        --dataloader_num_workers 1 \
        --ort \
        --skip_memory_metrics
      python ../../orttraining/tools/ci_test/compare_huggingface.py \
        ci-pipeline-actual.json \
        ../../orttraining/tools/ci_test/results/ci-mi100.huggingface.bert-large-rocm$(RocmVersion).json
|
|
|
|
# Train gpt2 for 260 steps with ORT via the Hugging Face run_clm.py example,
# then compare ci-pipeline-actual.json against the stored reference results
# for this ROCm version.
- task: CmdLine@2
  displayName: 'Run Python Hugging-Face GPT2 test'
  # Run whenever the build succeeded, even if an earlier test step failed.
  condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true'))
  retryCountOnTaskFailure: 1
  inputs:
    script: |-
      cd ./build/Release
      export PYTHONPATH=$PWD
      export ORTMODULE_ONNX_OPSET_VERSION=15
      python \
        /stage/huggingface-transformers/examples/pytorch/language-modeling/run_clm.py \
        --model_name_or_path gpt2 \
        --dataset_name wikitext \
        --dataset_config_name wikitext-2-raw-v1 \
        --do_train \
        --label_smoothing 0.1 \
        --max_steps 260 \
        --logging_steps 20 \
        --overwrite_output_dir \
        --output_dir ./test-clm \
        --per_device_train_batch_size 8 \
        --fp16 \
        --dataloader_num_workers 1 \
        --ort \
        --skip_memory_metrics
      python ../../orttraining/tools/ci_test/compare_huggingface.py \
        ci-pipeline-actual.json \
        ../../orttraining/tools/ci_test/results/ci-mi100.huggingface.gpt2-rocm$(RocmVersion).json
|
|
|
|
# - script: |-
|
|
# cd ./build/Release
|
|
# export PYTHONPATH=$PWD
|
|
# python \
|
|
# /stage/huggingface-transformers/examples/pytorch/translation/run_translation.py \
|
|
# --dataset_name wmt16 \
|
|
# --dataset_config ro-en \
|
|
# --model_name_or_path facebook/bart-large \
|
|
# --output_dir ./tst-translation \
|
|
# --do_train \
|
|
# --label_smoothing 0.1 \
|
|
# --logging_steps 20 \
|
|
# --overwrite_output_dir \
|
|
# --per_device_train_batch_size 16 \
|
|
# --predict_with_generate \
|
|
# --source_lang en --target_lang ro \
|
|
# --warmup_steps 5 \
|
|
# --fp16 \
|
|
# --max_steps 260 \
|
|
# --dataloader_num_workers 1 \
|
|
# --ort \
|
|
# --skip_memory_metrics
|
|
# python ../../orttraining/tools/ci_test/compare_huggingface.py \
|
|
# ci-pipeline-actual.json \
|
|
# ../../orttraining/tools/ci_test/results/ci-mi100.huggingface.bart-large-rocm5.2.json
|
|
# displayName: 'Run Python Hugging-Face BART-L test'
|
|
# condition: succeededOrFailed() # ensure all tests are run
|
|
|
|
# todo: investigate RoBERTa high run variability on ROCm 5.2
|
|
#- script: |-
|
|
# cd ./build/Release
|
|
# export PYTHONPATH=$PWD
|
|
# python \
|
|
# /stage/huggingface-transformers/examples/pytorch/question-answering/run_qa.py \
|
|
# --model_name_or_path roberta-large \
|
|
# --dataset_name squad \
|
|
# --do_train \
|
|
# --per_device_train_batch_size 16 \
|
|
# --learning_rate 3e-5 \
|
|
# --max_steps 260 \
|
|
# --max_seq_length 384 \
|
|
# --doc_stride 128 \
|
|
# --output_dir ./roberta_res \
|
|
# --overwrite_output_dir \
|
|
# --logging_steps 20 \
|
|
# --fp16 \
|
|
# --dataloader_num_workers 1 \
|
|
# --ort \
|
|
# --skip_memory_metrics
|
|
# python ../../orttraining/tools/ci_test/compare_huggingface.py \
|
|
# ci-pipeline-actual.json \
|
|
# ../../orttraining/tools/ci_test/results/ci-mi100.huggingface.roberta-large-rocm5.2.json
|
|
# displayName: 'Run Python Hugging-Face RoBERTa-L test'
|
|
# condition: succeededOrFailed() # ensure all tests are run
|
|
|
|
# Train distilbert-base-uncased for 260 steps with ORT via the Hugging Face
# run_mlm.py example, then compare ci-pipeline-actual.json against the stored
# reference results for this ROCm version.
- task: CmdLine@2
  displayName: 'Run Python Hugging-Face DistilBERT test'
  # Run whenever the build succeeded, even if an earlier test step failed.
  condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true'))
  retryCountOnTaskFailure: 1
  inputs:
    script: |-
      cd ./build/Release
      export PYTHONPATH=$PWD
      export ORTMODULE_ONNX_OPSET_VERSION=15
      python \
        /stage/huggingface-transformers/examples/pytorch/language-modeling/run_mlm.py \
        --model_name_or_path distilbert-base-uncased \
        --dataset_name wikitext \
        --dataset_config_name wikitext-2-raw-v1 \
        --do_train \
        --max_steps 260 \
        --logging_steps 20 \
        --output_dir ./test-mlm-bbu \
        --overwrite_output_dir \
        --per_device_train_batch_size 32 \
        --fp16 \
        --dataloader_num_workers 1 \
        --ort \
        --skip_memory_metrics
      python ../../orttraining/tools/ci_test/compare_huggingface.py \
        ci-pipeline-actual.json \
        ../../orttraining/tools/ci_test/results/ci-mi100.huggingface.distilbert-base-rocm$(RocmVersion).json
|
|
|
|
#- script: |-
|
|
# cd ./build/Release
|
|
# export PYTHONPATH=$PWD
|
|
# python \
|
|
# /stage/huggingface-transformers/examples/pytorch/text-classification/run_glue.py \
|
|
# --model_name_or_path microsoft/deberta-v2-xxlarge \
|
|
# --task_name MRPC \
|
|
# --do_train \
|
|
# --max_seq_length 128 \
|
|
# --per_device_train_batch_size 4 \
|
|
# --learning_rate 3e-6 \
|
|
# --max_steps 260 \
|
|
# --output_dir ./deberta_res \
|
|
# --overwrite_output_dir \
|
|
# --logging_steps 20 \
|
|
# --fp16 \
|
|
# --dataloader_num_workers 1 \
|
|
# --ort \
|
|
# --skip_memory_metrics
|
|
# displayName: 'Run Python Hugging-Face DeBERTa-XXL v2 test'
|
|
# condition: succeededOrFailed() # ensure all tests are run
|
|
|
|
#- script: |-
|
|
# cd ./build/Release
|
|
# export PYTHONPATH=$PWD
|
|
# python \
|
|
# /stage/huggingface-transformers/examples/pytorch/translation/run_translation.py \
|
|
# --source_prefix '"translate English to Romanian:"' \
|
|
# --dataset_name wmt16 \
|
|
# --dataset_config ro-en \
|
|
# --model_name_or_path t5-large \
|
|
# --output_dir ./tst-translation \
|
|
# --do_train \
|
|
# --label_smoothing 0.1 \
|
|
# --logging_steps 20 \
|
|
# --overwrite_output_dir \
|
|
# --per_device_train_batch_size 16 \
|
|
# --predict_with_generate \
|
|
# --source_lang en \
|
|
# --target_lang ro \
|
|
# --warmup_steps 5 \
|
|
# --fp16 \
|
|
# --max_steps 260 \
|
|
# --dataloader_num_workers 1 \
|
|
# --ort \
|
|
# --skip_memory_metrics
|
|
# python ../../orttraining/tools/ci_test/compare_huggingface.py \
|
|
# ci-pipeline-actual.json \
|
|
# ../../orttraining/tools/ci_test/results/ci-mi100.huggingface.t5-large.json
|
|
# displayName: 'Run Python Hugging-Face T5-L test'
|
|
# condition: succeededOrFailed() # ensure all tests are run
|
|
|
|
# Fetch the end-to-end training test archive from Azure Blob storage into
# training_e2e_test_data, passing the expected SHA-256 digest to the download
# script. The SAS token comes from a pipeline variable and is handed over via
# the environment.
- task: CmdLine@2
  displayName: 'Download onnxruntime_training_data.zip data'
  # Run whenever the build succeeded, even if an earlier test step failed.
  condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true'))
  retryCountOnTaskFailure: 2
  inputs:
    script: |-
      export AZURE_BLOB_SAS_TOKEN="$(onnxruntimetestdata-storage-training-container-sas-token)"
      python orttraining/tools/ci_test/download_azure_blob_archive.py \
        --azure_blob_url https://onnxruntimetestdata.blob.core.windows.net/training/onnxruntime_training_data.zip \
        --target_dir training_e2e_test_data \
        --archive_sha256_digest B01C169B6550D1A0A6F1B4E2F34AE2A8714B52DBB70AC04DA85D371F691BDFF9
|
|
|
|
# Batch size test driven by the built binaries and the downloaded models.
- task: CmdLine@2
  displayName: 'Run C++ BERT-L batch size test'
  # Run whenever the build succeeded, even if an earlier test step failed.
  condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true'))
  retryCountOnTaskFailure: 1
  inputs:
    script: |-
      python orttraining/tools/ci_test/run_batch_size_test.py \
        --binary_dir build/Release \
        --model_root training_e2e_test_data/models \
        --gpu_sku MI100_32G
|
|
|
|
# Performance test driven by the built binaries and the downloaded models/data.
- task: CmdLine@2
  displayName: 'Run C++ BERT-L performance test'
  # Run whenever the build succeeded, even if an earlier test step failed.
  condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true'))
  retryCountOnTaskFailure: 1
  inputs:
    script: |-
      python orttraining/tools/ci_test/run_bert_perf_test.py \
        --binary_dir build/Release \
        --model_root training_e2e_test_data/models \
        --training_data_root training_e2e_test_data/data \
        --gpu_sku MI100_32G
|
|
|
|
# Convergence test driven by the built binaries and the downloaded models/data.
- task: CmdLine@2
  displayName: 'Run C++ BERT-L convergence test'
  # Run whenever the build succeeded, even if an earlier test step failed.
  condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true'))
  retryCountOnTaskFailure: 1
  inputs:
    script: |-
      python orttraining/tools/ci_test/run_convergence_test.py \
        --binary_dir build/Release \
        --model_root training_e2e_test_data/models \
        --training_data_root training_e2e_test_data/data \
        --gpu_sku MI100_32G
|
|
|
|
# Install cifs-utils (presumably needed by the share-mount steps that follow).
# apt-get is used for both commands: the `apt` front end is meant for
# interactive use and warns that its CLI is not stable when used in scripts.
- script: |
    sudo apt-get update
    sudo apt-get install -y cifs-utils
  displayName: 'Install filesystems util'
  condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true'))
|
|
|
|
# Mount the mnist file share at /mnist. The storage key macro is expanded into
# the command text before bash runs, so it is quoted to keep it a single
# argument whatever characters it contains.
- bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p "$(orttrainingtestdatascus-storage-key)" -s "//orttrainingtestdatascus.file.core.windows.net/mnist" -d "/mnist"
  displayName: 'Mount MNIST'
  condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true'))
|
|
|
|
# Mount the bert-data file share at /bert_data. The storage key macro is
# expanded into the command text before bash runs, so it is quoted to keep it
# a single argument whatever characters it contains.
- bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p "$(orttrainingtestdatascus-storage-key)" -s "//orttrainingtestdatascus.file.core.windows.net/bert-data" -d "/bert_data"
  displayName: 'Mount bert-data'
  condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true'))
|
|
|
|
# Mount the hf-models-cache file share at /hf_models_cache. The storage key
# macro is expanded into the command text before bash runs, so it is quoted to
# keep it a single argument whatever characters it contains.
- bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p "$(orttrainingtestdatascus-storage-key)" -s "//orttrainingtestdatascus.file.core.windows.net/hf-models-cache" -d "/hf_models_cache"
  displayName: 'Mount hf-models-cache'
  condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true'))
|
|
|
|
|
|
# Entry point for all ORTModule tests
|
|
# The onnxruntime folder is deleted in the build directory
|
|
# to enforce use of the onnxruntime wheel
|
|
#
|
|
# On every upgrade, enable this step once to test it, then disable it again.
|
|
#
|
|
# TODO - consider cherry-picking a subset of the UTs to reduce the CI time.
|
|
# We can re-enable this part again when it's done.
|
|
|
|
# - task: CmdLine@2
|
|
# inputs:
|
|
# script: |-
|
|
# cd ./build/Release
|
|
# unset PYTHONPATH
|
|
# rm -rf onnxruntime
|
|
# pip install ./dist/onnxruntime*.whl
|
|
# python -m onnxruntime.training.ortmodule.torch_cpp_extensions.install
|
|
# python orttraining_ortmodule_tests.py \
|
|
# --mnist /mnist \
|
|
# --bert_data /bert_data/hf_data/glue_data/CoLA/original/raw \
|
|
# --transformers_cache /hf_models_cache/huggingface/transformers
|
|
# displayName: 'Run orttraining_ortmodule_tests.py'
|
|
# condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true'))
|