# onnxruntime/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml

# CI (push) triggers are disabled for this pipeline.
trigger: none

# Run name format: orttraining_ci_<yyyyMMdd date>_<revision counter>.
name: 'orttraining_ci_$(Date:yyyyMMdd)_$(Rev:r)'
jobs:
# Single job that builds ONNX Runtime with ROCm and training enabled, then runs the test steps below.
- job: AMD_CI
  # Clean the whole job workspace before the job starts.
  workspace:
    clean: all
  # Agent pool the job is scheduled on.
  pool: 'AMD-GPU'
  # Upper bound for the whole job (build plus all tests).
  timeoutInMinutes: 150

  # Job-level variables. All literal values are quoted so that they are read as
  # strings (pipeline variables are strings) and never re-typed as numbers or
  # booleans, e.g. a future version such as 5.10 must not collapse to 5.1.
  variables:
    # gid of video and render group on gcramdrr1-mi100-085 and -86;
    # both are passed to the container via --group-add.
    - name: video
      value: '44'
    - name: render
      value: '109'
    # Set to true by the 'Set Onnxruntime Build Succeeded' step; test steps
    # compare it against the string 'true' in their conditions.
    - name: onnxruntimeBuildSucceeded
      value: 'false'
    # Used for the container image tag, the --rocm_version build flag and the
    # names of the expected-result JSON files.
    - name: RocmVersion
      value: '5.4'
    # Directory saved/restored by the Cache task.
    - name: CCACHE_DIR
      value: $(Pipeline.Workspace)/ccache
    # Pipeline start date formatted as ddMMyyyy; used as the first segment of the cache key.
    - name: TODAY
      value: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)]

  # Container the job's steps run in (unless a step sets `target: host`).
  # generated from tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile
  container:
    # Image tag is derived from the RocmVersion variable.
    image: onnxruntimecibuildenvironment.azurecr.io/rocm-ci-pipeline-env:rocm$(RocmVersion)
    # Service connection used to pull the image from the registry.
    endpoint: onnxruntimecibuildenvironmentforamd
    # Docker run options: forward HIP_VISIBLE_DEVICES from the host, expose the
    # /dev/kfd and /dev/dri devices, and add the container user to the video and
    # render groups (gids come from the `video` and `render` variables).
    options: --privileged -e HIP_VISIBLE_DEVICES --security-opt seccomp=unconfined --shm-size=1024m --device=/dev/kfd --device=/dev/dri  --group-add $(video) --group-add $(render)

  steps:
  # Check out the repository that triggered the run, cleaning the working tree first.
  - checkout: self
    clean: true

  # Print the HIP_VISIBLE_DEVICES value that is visible to the steps.
  - task: CmdLine@2
    displayName: 'Initialize environment'
    inputs:
      script: 'echo "Selecting GPU based on HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES"'

  #- script: |-
  #    sed -i 's|session_options.use_deterministic_compute = False|session_options.use_deterministic_compute = True|g' \
  #       orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py
  #  displayName: 'Toggle ON deterministic compute mode for ORTModule'

  # Runs on the host rather than in the job container (target: host): prints the
  # agent name and hands it to pai_get_thread.sh.
  # $(Agent.Name) is quoted so that a name containing spaces or glob characters
  # reaches the script as a single, unmodified argument.
  - task: CmdLine@2
    displayName: 'Check ROCm Environment'
    inputs:
      script: |-
        echo "$(Agent.Name)"
        bash ./tools/ci_build/github/pai/pai_get_thread.sh "$(Agent.Name)"
    target: host

  # Restore (and later save) the ccache directory.
  # The key is date | branch | commit; the restore keys fall back to a cache from
  # the same day and branch, then to any cache from the same day.
  # CACHE_RESTORED receives the cache-hit result of the task.
  - task: Cache@2
    displayName: Cache Task
    inputs:
      path: $(CCACHE_DIR)
      key: '"$(TODAY)" | "$(Build.SourceBranch)" | "$(Build.SourceVersion)"'
      restoreKeys: |
        "$(TODAY)" | "$(Build.SourceBranch)"
        "$(TODAY)" |
      cacheHitVar: CACHE_RESTORED

  # Create the ccache directory unless CACHE_RESTORED is 'true'.
  - task: CmdLine@2
    displayName: Create Cache Dir
    condition: ne(variables.CACHE_RESTORED, 'true')
    inputs:
      script: mkdir -p $(CCACHE_DIR)

  # Zero the ccache statistics, then print them, so the numbers shown after the
  # build reflect this build only.
  - task: CmdLine@2
    displayName: Show Cache Stats Before Building
    inputs:
      script: ccache -z && ccache -s

  # Configure (--update) and build (--build) ONNX Runtime in Release mode with
  # ROCm and training enabled, including the kernel explorer and the Python
  # wheel. Tests are skipped here (--skip_tests); the following steps run them.
  # Every continued line ends with " \" (space + backslash) so that arguments can
  # never be glued together regardless of how the next line is indented.
  - task: CmdLine@2
    inputs:
      script: |-
        export ROCM_HOME=/opt/rocm
        python tools/ci_build/build.py \
          --config Release \
          --enable_training \
          --mpi_home /opt/ompi \
          --cmake_extra_defines \
              CMAKE_HIP_COMPILER=${ROCM_HOME}/llvm/bin/clang++ \
              onnxruntime_BUILD_KERNEL_EXPLORER=ON \
          --use_cache \
          --use_rocm \
          --rocm_version=$(RocmVersion) \
          --rocm_home ${ROCM_HOME} \
          --nccl_home ${ROCM_HOME} \
          --update \
          --build_dir ./build \
          --build \
          --parallel 32 \
          --build_wheel \
          --skip_tests
    displayName: 'Build onnxruntime'

  # Print the ccache statistics accumulated during the build.
  - task: CmdLine@2
    displayName: Show Cache Stats After Building
    inputs:
      script: ccache -s

  # Flip onnxruntimeBuildSucceeded to true through a logging command. This step
  # keeps the default condition, so it only runs when all previous steps
  # succeeded; the test steps check the variable in their conditions.
  - bash: 'echo "##vso[task.setvariable variable=onnxruntimeBuildSucceeded]true"'
    displayName: 'Set Onnxruntime Build Succeeded'

  # Run pai_test_launcher.sh from inside build/Release; the launcher is only
  # started if the cd succeeds.
  - script: |-
      cd ./build/Release &&\
      ../../tools/ci_build/github/pai/pai_test_launcher.sh
    displayName: 'Run onnxruntime unit tests'

  # Run the kernel explorer pytest suite with 16 workers and one rerun for
  # failing tests, pointing it at the Release build directory.
  # Runs even when an earlier test step failed, provided the build succeeded.
  - script: |-
      export KERNEL_EXPLORER_BUILD_DIR=./build/Release
      pytest ./onnxruntime/python/tools/kernel_explorer/ -n 16 --reruns 1
    displayName: 'Run kernel explorer tests'
    condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true'))

  # Run the torch_cpp_extensions installer module from build/Release, with
  # PYTHONPATH pointing at that directory.
  # Runs even when an earlier test step failed, provided the build succeeded.
  - script: |-
      cd ./build/Release
      export PYTHONPATH=$PWD
      python -m onnxruntime.training.ortmodule.torch_cpp_extensions.install
    displayName: 'Compile torch extensions into build directory'
    condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true')) # ensure all tests are run when the build succeeded

  # Train bert-large-uncased (masked LM, 260 steps, fp16, --ort) with the
  # Hugging Face example script, then compare ci-pipeline-actual.json against the
  # expected results for the current ROCm version.
  # `set -e` makes the step fail as soon as the cd or the training run fails;
  # without it the exit code of the compare script alone decides the step result
  # and the compare could read a stale ci-pipeline-actual.json.
  # The step is retried once on failure.
  - task: CmdLine@2
    inputs:
      script: |-
        set -e
        cd ./build/Release
        export PYTHONPATH=$PWD
        export ORTMODULE_ONNX_OPSET_VERSION=15
        python \
          /stage/huggingface-transformers/examples/pytorch/language-modeling/run_mlm.py \
          --model_name_or_path bert-large-uncased \
          --dataset_name wikitext \
          --dataset_config_name wikitext-2-raw-v1 \
          --do_train \
          --max_steps 260 \
          --logging_steps 20 \
          --output_dir ./test-mlm-bbu \
          --overwrite_output_dir \
          --per_device_train_batch_size 8 \
          --fp16 \
          --dataloader_num_workers 1 \
          --ort \
          --skip_memory_metrics
        python ../../orttraining/tools/ci_test/compare_huggingface.py \
          ci-pipeline-actual.json \
          ../../orttraining/tools/ci_test/results/ci-mi100.huggingface.bert-large-rocm$(RocmVersion).json
    displayName: 'Run Python Hugging-Face BERT-L test'
    retryCountOnTaskFailure: 1
    condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true')) # ensure all tests are run when the build succeeded

  # Train gpt2 (causal LM, 260 steps, fp16, label smoothing 0.1, --ort) with the
  # Hugging Face example script, then compare ci-pipeline-actual.json against the
  # expected results for the current ROCm version.
  # `set -e` makes the step fail as soon as the cd or the training run fails;
  # without it the exit code of the compare script alone decides the step result
  # and the compare could read a stale ci-pipeline-actual.json left in
  # build/Release by an earlier run.
  # The step is retried once on failure.
  - task: CmdLine@2
    inputs:
      script: |-
        set -e
        cd ./build/Release
        export PYTHONPATH=$PWD
        export ORTMODULE_ONNX_OPSET_VERSION=15
        python \
          /stage/huggingface-transformers/examples/pytorch/language-modeling/run_clm.py \
          --model_name_or_path gpt2 \
          --dataset_name wikitext \
          --dataset_config_name wikitext-2-raw-v1 \
          --do_train \
          --label_smoothing 0.1 \
          --max_steps 260 \
          --logging_steps 20 \
          --overwrite_output_dir \
          --output_dir ./test-clm \
          --per_device_train_batch_size 8 \
          --fp16 \
          --dataloader_num_workers 1 \
          --ort \
          --skip_memory_metrics
        python ../../orttraining/tools/ci_test/compare_huggingface.py \
          ci-pipeline-actual.json \
          ../../orttraining/tools/ci_test/results/ci-mi100.huggingface.gpt2-rocm$(RocmVersion).json
    displayName: 'Run Python Hugging-Face GPT2 test'
    retryCountOnTaskFailure: 1
    condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true')) # ensure all tests are run when the build succeeded

#  - script: |-
#      cd ./build/Release
#      export PYTHONPATH=$PWD
#      python \
#        /stage/huggingface-transformers/examples/pytorch/translation/run_translation.py \
#        --dataset_name wmt16 \
#        --dataset_config ro-en \
#        --model_name_or_path facebook/bart-large \
#        --output_dir ./tst-translation \
#        --do_train \
#        --label_smoothing 0.1 \
#        --logging_steps 20 \
#        --overwrite_output_dir \
#        --per_device_train_batch_size 16 \
#        --predict_with_generate \
#        --source_lang en --target_lang ro \
#        --warmup_steps 5 \
#        --fp16 \
#        --max_steps 260 \
#        --dataloader_num_workers 1 \
#        --ort \
#        --skip_memory_metrics
#      python ../../orttraining/tools/ci_test/compare_huggingface.py \
#        ci-pipeline-actual.json \
#        ../../orttraining/tools/ci_test/results/ci-mi100.huggingface.bart-large-rocm5.2.json
#    displayName: 'Run Python Hugging-Face BART-L test'
#    condition: succeededOrFailed() # ensure all tests are run

  # todo: investigate RoBERTa high run variability on ROCm 5.2
  #- script: |-
  #    cd ./build/Release
  #    export PYTHONPATH=$PWD
  #    python \
  #      /stage/huggingface-transformers/examples/pytorch/question-answering/run_qa.py \
  #      --model_name_or_path roberta-large \
  #      --dataset_name squad \
  #      --do_train \
  #      --per_device_train_batch_size 16 \
  #      --learning_rate 3e-5 \
  #      --max_steps 260 \
  #      --max_seq_length 384 \
  #      --doc_stride 128 \
  #      --output_dir ./roberta_res \
  #      --overwrite_output_dir \
  #      --logging_steps 20 \
  #      --fp16 \
  #      --dataloader_num_workers 1 \
  #      --ort \
  #      --skip_memory_metrics
  #    python ../../orttraining/tools/ci_test/compare_huggingface.py \
  #      ci-pipeline-actual.json \
  #      ../../orttraining/tools/ci_test/results/ci-mi100.huggingface.roberta-large-rocm5.2.json
  #  displayName: 'Run Python Hugging-Face RoBERTa-L test'
  #  condition: succeededOrFailed() # ensure all tests are run

  # Train distilbert-base-uncased (masked LM, 260 steps, fp16, batch size 32,
  # --ort) with the Hugging Face example script, then compare
  # ci-pipeline-actual.json against the expected results for the current ROCm
  # version.
  # `set -e` makes the step fail as soon as the cd or the training run fails;
  # without it the exit code of the compare script alone decides the step result
  # and the compare could read a stale ci-pipeline-actual.json left in
  # build/Release by an earlier run.
  # NOTE(review): --output_dir ./test-mlm-bbu is the same directory the BERT-L
  # step writes to; --overwrite_output_dir is passed, so this is presumably
  # harmless — confirm it is intentional.
  # The step is retried once on failure.
  - task: CmdLine@2
    inputs:
      script: |-
        set -e
        cd ./build/Release
        export PYTHONPATH=$PWD
        export ORTMODULE_ONNX_OPSET_VERSION=15
        python \
          /stage/huggingface-transformers/examples/pytorch/language-modeling/run_mlm.py \
          --model_name_or_path distilbert-base-uncased \
          --dataset_name wikitext \
          --dataset_config_name wikitext-2-raw-v1 \
          --do_train \
          --max_steps 260 \
          --logging_steps 20 \
          --output_dir ./test-mlm-bbu \
          --overwrite_output_dir \
          --per_device_train_batch_size 32 \
          --fp16 \
          --dataloader_num_workers 1 \
          --ort \
          --skip_memory_metrics
        python ../../orttraining/tools/ci_test/compare_huggingface.py \
          ci-pipeline-actual.json \
          ../../orttraining/tools/ci_test/results/ci-mi100.huggingface.distilbert-base-rocm$(RocmVersion).json
    displayName: 'Run Python Hugging-Face DistilBERT test'
    retryCountOnTaskFailure: 1
    condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true')) # ensure all tests are run when the build succeeded

  #- script: |-
  #    cd ./build/Release
  #    export PYTHONPATH=$PWD
  #    python \
  #      /stage/huggingface-transformers/examples/pytorch/text-classification/run_glue.py \
  #      --model_name_or_path microsoft/deberta-v2-xxlarge \
  #      --task_name MRPC \
  #      --do_train \
  #      --max_seq_length 128 \
  #      --per_device_train_batch_size 4 \
  #      --learning_rate 3e-6 \
  #      --max_steps 260 \
  #      --output_dir ./deberta_res \
  #      --overwrite_output_dir \
  #      --logging_steps 20 \
  #      --fp16 \
  #      --dataloader_num_workers 1 \
  #      --ort \
  #      --skip_memory_metrics
  #  displayName: 'Run Python Hugging-Face DeBERTa-XXL v2 test'
  #  condition: succeededOrFailed() # ensure all tests are run

  #- script: |-
  #    cd ./build/Release
  #    export PYTHONPATH=$PWD
  #    python \
  #      /stage/huggingface-transformers/examples/pytorch/translation/run_translation.py \
  #      --source_prefix '"translate English to Romanian:"' \
  #      --dataset_name wmt16 \
  #      --dataset_config ro-en \
  #      --model_name_or_path t5-large \
  #      --output_dir ./tst-translation \
  #      --do_train \
  #      --label_smoothing 0.1 \
  #      --logging_steps 20 \
  #      --overwrite_output_dir \
  #      --per_device_train_batch_size 16 \
  #      --predict_with_generate \
  #      --source_lang en \
  #      --target_lang ro \
  #      --warmup_steps 5 \
  #      --fp16 \
  #      --max_steps 260 \
  #      --dataloader_num_workers 1 \
  #      --ort \
  #      --skip_memory_metrics
  #    python ../../orttraining/tools/ci_test/compare_huggingface.py \
  #      ci-pipeline-actual.json \
  #      ../../orttraining/tools/ci_test/results/ci-mi100.huggingface.t5-large.json
  #  displayName: 'Run Python Hugging-Face T5-L test'
  #  condition: succeededOrFailed() # ensure all tests are run

  # Download onnxruntime_training_data.zip into training_e2e_test_data, passing
  # the expected SHA-256 digest of the archive to the download script. The SAS
  # token is handed over through the AZURE_BLOB_SAS_TOKEN environment variable.
  # Retried up to twice on failure; runs whenever the build succeeded.
  - script: |-
      export AZURE_BLOB_SAS_TOKEN="$(onnxruntimetestdata-storage-training-container-sas-token)"
      python orttraining/tools/ci_test/download_azure_blob_archive.py \
        --azure_blob_url https://onnxruntimetestdata.blob.core.windows.net/training/onnxruntime_training_data.zip \
        --target_dir training_e2e_test_data \
        --archive_sha256_digest B01C169B6550D1A0A6F1B4E2F34AE2A8714B52DBB70AC04DA85D371F691BDFF9
    displayName: 'Download onnxruntime_training_data.zip data'
    retryCountOnTaskFailure: 2
    condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true')) # ensure all tests are run when the build succeeded

  # Run run_batch_size_test.py against the Release binaries, using the models
  # under training_e2e_test_data/models and the MI100_32G GPU SKU.
  # Retried once on failure; runs whenever the build succeeded.
  - script: |-
      python orttraining/tools/ci_test/run_batch_size_test.py \
        --binary_dir build/Release \
        --model_root training_e2e_test_data/models \
        --gpu_sku MI100_32G
    displayName: 'Run C++ BERT-L batch size test'
    retryCountOnTaskFailure: 1
    condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true')) # ensure all tests are run when the build succeeded

  # Run run_bert_perf_test.py against the Release binaries, using the models and
  # training data under training_e2e_test_data and the MI100_32G GPU SKU.
  # Retried once on failure; runs whenever the build succeeded.
  - script: |-
      python orttraining/tools/ci_test/run_bert_perf_test.py \
        --binary_dir build/Release \
        --model_root training_e2e_test_data/models \
        --training_data_root training_e2e_test_data/data \
        --gpu_sku MI100_32G
    displayName: 'Run C++ BERT-L performance test'
    retryCountOnTaskFailure: 1
    condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true')) # ensure all tests are run when the build succeeded

  # Run run_convergence_test.py against the Release binaries, using the models
  # and training data under training_e2e_test_data and the MI100_32G GPU SKU.
  # Retried once on failure; runs whenever the build succeeded.
  - script: |-
      python orttraining/tools/ci_test/run_convergence_test.py \
        --binary_dir build/Release \
        --model_root training_e2e_test_data/models \
        --training_data_root training_e2e_test_data/data \
        --gpu_sku MI100_32G
    displayName: 'Run C++ BERT-L convergence test'
    retryCountOnTaskFailure: 1
    condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true')) # ensure all tests are run when the build succeeded

  # Install cifs-utils (presumably required by the mount steps that follow — confirm).
  # apt-get is used for the install as well: the `apt` front end does not
  # guarantee a stable command-line interface and warns when used from scripts.
  # Runs whenever the build succeeded, even if an earlier test step failed.
  - script: |
      sudo apt-get update
      sudo apt-get install -y cifs-utils
    displayName: 'Install filesystems util'
    condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true'))

  # Call the mount helper script for the "mnist" file share (-s) with /mnist as
  # the -d argument (presumably the mount point — confirm against the script).
  # The storage key is quoted so the shell passes it to -p as one unmodified
  # argument even if it contains characters that are special to the shell.
  - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p "$(orttrainingtestdatascus-storage-key)" -s "//orttrainingtestdatascus.file.core.windows.net/mnist" -d "/mnist"
    displayName: 'Mount MNIST'
    condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true'))

  # Call the mount helper script for the "bert-data" file share (-s) with
  # /bert_data as the -d argument (presumably the mount point — confirm against
  # the script).
  # The storage key is quoted so the shell passes it to -p as one unmodified
  # argument even if it contains characters that are special to the shell.
  - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p "$(orttrainingtestdatascus-storage-key)" -s "//orttrainingtestdatascus.file.core.windows.net/bert-data" -d "/bert_data"
    displayName: 'Mount bert-data'
    condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true'))

  # Call the mount helper script for the "hf-models-cache" file share (-s) with
  # /hf_models_cache as the -d argument (presumably the mount point — confirm
  # against the script).
  # The storage key is quoted so the shell passes it to -p as one unmodified
  # argument even if it contains characters that are special to the shell.
  - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p "$(orttrainingtestdatascus-storage-key)" -s "//orttrainingtestdatascus.file.core.windows.net/hf-models-cache" -d "/hf_models_cache"
    displayName: 'Mount hf-models-cache'
    condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true'))


  # Entry point for all ORTModule tests
  # The onnxruntime folder is deleted in the build directory
  # to enforce use of the onnxruntime wheel
  #
  # After every upgrade, enable this step for one run to validate it, then disable it again.
  #
  # TODO: consider cherry-picking a subset of the unit tests to reduce the CI time.
  # This step can be re-enabled once that is done.

  # - task: CmdLine@2
  #   inputs:
  #     script: |-
  #       cd ./build/Release
  #       unset PYTHONPATH
  #       rm -rf onnxruntime
  #       pip install ./dist/onnxruntime*.whl
  #       python -m onnxruntime.training.ortmodule.torch_cpp_extensions.install
  #       python orttraining_ortmodule_tests.py \
  #         --mnist /mnist \
  #         --bert_data /bert_data/hf_data/glue_data/CoLA/original/raw \
  #         --transformers_cache /hf_models_cache/huggingface/transformers
  #   displayName: 'Run orttraining_ortmodule_tests.py'
  #   condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true'))