diff --git a/orttraining/tools/ci_test/results/ci-mi200.huggingface.bert-large-rocm5.5.json b/orttraining/tools/ci_test/results/ci-mi200.huggingface.bert-large-rocm5.5.json new file mode 100644 index 0000000000..a4ac02b566 --- /dev/null +++ b/orttraining/tools/ci_test/results/ci-mi200.huggingface.bert-large-rocm5.5.json @@ -0,0 +1,57 @@ +{ + "steps": [ + { + "step": 20, + "loss": 2.0017 + }, + { + "step": 40, + "loss": 1.8337 + }, + { + "step": 60, + "loss": 1.7538 + }, + { + "step": 80, + "loss": 1.6728 + }, + { + "step": 100, + "loss": 1.6656 + }, + { + "step": 120, + "loss": 1.6752 + }, + { + "step": 140, + "loss": 1.6335 + }, + { + "step": 160, + "loss": 1.6815 + }, + { + "step": 180, + "loss": 1.6155 + }, + { + "step": 200, + "loss": 1.6177 + }, + { + "step": 220, + "loss": 1.632 + }, + { + "step": 240, + "loss": 1.5161 + }, + { + "step": 260, + "loss": 1.5433 + } + ], + "samples_per_second": 32.335 +} diff --git a/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml index 2dd9ab6e0d..da042bc339 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml @@ -1,29 +1,27 @@ trigger: none name: 'orttraining_ci_$(Date:yyyyMMdd)_$(Rev:r)' + +variables: + - name: video + value: 44 + - name: render + value: 109 + - name: RocmVersion + value: 5.5 + - name: BuildConfig + value: Release + jobs: -- job: AMD_CI +- job: Linux_Build + variables: + skipComponentGovernanceDetection: true + CCACHE_DIR: $(Pipeline.Workspace)/ccache + TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)] workspace: clean: all - pool: 'AMD-GPU' - timeoutInMinutes: 150 - - # gid of video and render group on gcramdrr1-mi100-085 and -86 - variables: - - name: video - value: 44 - - name: render - value: 109 - - name: onnxruntimeBuildSucceeded - value: false - - name: RocmVersion - value: 5.5 - - name: 
BuildConfig - value: Release - - name: CCACHE_DIR - value: $(Pipeline.Workspace)/ccache - - name: TODAY - value: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)] + pool: onnxruntime-Ubuntu2004-AMD-CPU + timeoutInMinutes: 120 steps: - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 @@ -38,22 +36,13 @@ jobs: parameters: Dockerfile: tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile Context: tools/ci_build/github/linux/docker - DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )" - Repository: onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion) + Repository: onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion)-build #- script: |- # sed -i 's|session_options.use_deterministic_compute = False|session_options.use_deterministic_compute = True|g' \ # orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py # displayName: 'Toggle ON deterministic compute mode for ORTModule' - - task: CmdLine@2 - inputs: - script: |- - echo "Select agent: $(Agent.Name), GPU: $HIP_VISIBLE_DEVICES, render: $DRIVER_RENDER" - bash $(Build.SourcesDirectory)/tools/ci_build/github/pai/pai_get_thread.sh $(Agent.Name) - workingDirectory: $(Build.SourcesDirectory) - displayName: 'Check ROCm Environment' - - task: Cache@2 inputs: key: '"$(TODAY)" | "$(Build.SourceBranch)" | "$(Build.SourceVersion)"' @@ -75,17 +64,13 @@ jobs: docker run --rm \ --security-opt seccomp=unconfined \ --shm-size=1024m \ - --device=/dev/kfd \ - --device=/dev/dri/renderD$DRIVER_RENDER \ - --group-add $(video) \ - --group-add $(render) \ - --user onnxruntimedev \ + --user $UID:$(id -g $USER) \ --volume $(Build.SourcesDirectory):/onnxruntime_src \ --volume $(Build.BinariesDirectory):/build \ --volume $(CCACHE_DIR):/cache \ -e CCACHE_DIR=/cache \ --workdir /onnxruntime_src \ - onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion) \ + onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion)-build \ /bin/bash -c " set -ex; \ ccache -s; \ @@ -104,7 +89,7 @@ jobs: --update \ --build_dir 
/build \ --build \ - --parallel 32 \ + --parallel \ --build_wheel \ --skip_submodule_sync \ --skip_tests; \ @@ -112,12 +97,49 @@ jobs: ccache -z" displayName: 'Build onnxruntime' + - task: PublishPipelineArtifact@0 + displayName: 'Publish Pipeline Artifact' + inputs: + artifactName: 'drop-linux' + targetPath: '$(Build.BinariesDirectory)/Release' - - bash: |- - # Do not output ##vso[] commands with `set -x` or they may be parsed again and include a trailing quote. - set +x - echo "##vso[task.setvariable variable=onnxruntimeBuildSucceeded]true" - displayName: 'Set Onnxruntime Build Succeeded' + - template: templates/explicitly-defined-final-tasks.yml + + +- job: Linux_Test + workspace: + clean: all + pool: AMD-GPU + dependsOn: + - Linux_Build + timeoutInMinutes: 120 + + steps: + - task: DownloadPipelineArtifact@2 + displayName: 'Download Pipeline Artifact' + inputs: + buildType: 'current' + artifactName: 'drop-linux' + targetPath: '$(Build.BinariesDirectory)/Release' + + - checkout: self + clean: true + submodules: recursive + + - template: templates/get-docker-image-steps.yml + parameters: + Dockerfile: tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile + Context: tools/ci_build/github/linux/docker + DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )" + Repository: onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion)-test + + - task: CmdLine@2 + inputs: + script: |- + echo "Select agent: $(Agent.Name), GPU: $HIP_VISIBLE_DEVICES, render: $DRIVER_RENDER" + bash $(Build.SourcesDirectory)/tools/ci_build/github/pai/pai_get_thread.sh $(Agent.Name) $HIP_VISIBLE_DEVICES + workingDirectory: $(Build.SourcesDirectory) + displayName: 'Check ROCm Environment' - task: CmdLine@2 inputs: @@ -133,11 +155,14 @@ jobs: --volume $(Build.SourcesDirectory):/onnxruntime_src \ --volume $(Build.BinariesDirectory):/build \ --workdir /build/$(BuildConfig) \ - onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion) \ - /onnxruntime_src/tools/ci_build/github/pai/pai_test_launcher.sh + 
onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion)-test \ + /bin/bash -c " + set -ex; \ + chmod a+x /build/Release/onnxruntime_test_all; \ + /onnxruntime_src/tools/ci_build/github/pai/pai_test_launcher.sh" workingDirectory: $(Build.SourcesDirectory) displayName: 'Run onnxruntime unit tests' - condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true')) + condition: succeededOrFailed() - task: CmdLine@2 inputs: @@ -152,14 +177,14 @@ jobs: --user onnxruntimedev \ --volume $(Build.SourcesDirectory):/onnxruntime_src \ --volume $(Build.BinariesDirectory):/build \ - onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion) \ + onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion)-test \ /bin/bash -c " set -ex; \ export KERNEL_EXPLORER_BUILD_DIR=/build/$(BuildConfig); \ - pytest /onnxruntime_src/onnxruntime/python/tools/kernel_explorer/ -n 16 --reruns 1" + pytest /onnxruntime_src/onnxruntime/python/tools/kernel_explorer/ -n 8 --reruns 1 --durations=100" workingDirectory: $(Build.SourcesDirectory) displayName: 'Run kernel explorer tests' - condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true')) + condition: succeededOrFailed() - task: CmdLine@2 inputs: @@ -175,32 +200,15 @@ jobs: --volume $(Build.SourcesDirectory):/onnxruntime_src \ --volume $(Build.BinariesDirectory):/build \ --workdir /build/$(BuildConfig) \ - onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion) \ + onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion)-test \ /bin/bash -c " set -ex; \ export PYTHONPATH=/build/$(BuildConfig); \ python -m onnxruntime.training.ortmodule.torch_cpp_extensions.install; \ - python \ - /home/onnxruntimedev/huggingface-transformers/examples/pytorch/language-modeling/run_mlm.py \ - --model_name_or_path bert-large-uncased \ - --dataset_name wikitext \ - --dataset_config_name wikitext-2-raw-v1 \ - --do_train \ - --max_steps 260 \ - --logging_steps 20 \ - --output_dir ./test-mlm-bbu \ - --overwrite_output_dir \ - --per_device_train_batch_size 8 \ 
- --fp16 \ - --dataloader_num_workers 1 \ - --ort \ - --skip_memory_metrics; \ - python /onnxruntime_src/orttraining/tools/ci_test/compare_huggingface.py \ - ci-pipeline-actual.json \ - /onnxruntime_src/orttraining/tools/ci_test/results/ci-mi100.huggingface.bert-large-rocm$(RocmVersion).json" + bash /onnxruntime_src/tools/ci_build/github/pai/pai_huggingface_bert_large_test.sh $(RocmVersion)" workingDirectory: $(Build.SourcesDirectory) displayName: 'Run Python Hugging-Face BERT-L test' - condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true')) + condition: succeededOrFailed() # Entry point for all ORTModule tests @@ -215,38 +223,28 @@ jobs: whlfilename=$(basename ${files[0]}) echo $whlfilename docker run --rm \ - -e HIP_VISIBLE_DEVICES \ --security-opt seccomp=unconfined \ --shm-size=1024m \ --device=/dev/kfd \ - --device=/dev/dri \ - --privileged \ + --device=/dev/dri/renderD$DRIVER_RENDER \ --group-add $(video) \ --group-add $(render) \ --user onnxruntimedev \ --volume $(Build.BinariesDirectory):/build \ --volume $(Build.SourcesDirectory):/onnxruntime_src \ --workdir /build/$(BuildConfig) \ - onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion) \ + onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion)-test \ /bin/bash -c " set -ex; \ unset PYTHONPATH; \ - /onnxruntime_src/tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/mnist" -d "/mnist"; \ - /onnxruntime_src/tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/bert-data" -d "/bert_data"; \ pip install /build/$(BuildConfig)/dist/$whlfilename; \ python -m onnxruntime.training.ortmodule.torch_cpp_extensions.install; \ + mkdir /home/onnxruntimedev/mnist /home/onnxruntimedev/bert_data; \ python orttraining_ortmodule_tests.py \ - 
--mnist /mnist \ - --bert_data /bert_data/hf_data/glue_data/CoLA/original/raw" + --mnist /home/onnxruntimedev/mnist \ + --bert_data /home/onnxruntimedev/bert_data/hf_data/glue_data/CoLA/original/raw" workingDirectory: $(Build.SourcesDirectory) displayName: 'Run orttraining_ortmodule_tests.py' - condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true')) + condition: succeededOrFailed() - - template: templates/component-governance-component-detection-steps.yml - parameters : - condition : 'succeeded' - - - script: docker image prune -f - displayName: Clean docker images - condition: eq(variables['Agent.OS'], 'Linux') - continueOnError: true + - template: templates/clean-agent-build-directory-step.yml diff --git a/tools/ci_build/github/pai/pai_get_thread.sh b/tools/ci_build/github/pai/pai_get_thread.sh index 5e2811d579..4ebbcf4920 100755 --- a/tools/ci_build/github/pai/pai_get_thread.sh +++ b/tools/ci_build/github/pai/pai_get_thread.sh @@ -1,10 +1,10 @@ -#!/bin/bash +#!/bin/bash +set -ex agentName=$1 -finalCharacter=${agentName: -1} +target_device=$2 echo "agent name $agentName" -echo "agent name final character : $finalCharacter" -targetRender=$((finalCharacter+128)) +echo "agent target device : $target_device" echo -e "\n ---- rocm-smi" rocm-smi @@ -13,19 +13,26 @@ echo -e "\n ---- rocm-smi --showpids" rocm-smi --showpids echo -e "\n ---- rocm-smi --showpidgpus" -rocm-smi --showpidgpus +rocm-smi --showpidgpus echo -e "\n ---- rocm-smi --showpids detail" rocm-smi --showpids | awk '$1 ~/[0-9]+/{if((NR>6)) {print $1}}' | xargs -I {} ps {} echo -e "\n ---- rocm-smi --showmeminfo" -rocm-smi --showmeminfo vram vis_vram gtt +rocm-smi --showmeminfo vram vis_vram gtt -echo -e "\n ---- show all renders" -lsof /dev/dri/renderD* +echo -e "\n ---- Clean up the process that is using the target device" +gpu_details=$(rocm-smi --showpidgpus) +pid_lines=$(echo "$gpu_details" | grep -n "DRM device" | cut -d ":" -f 1) +pid_lines_array=($pid_lines) -echo -e
"\n ---- show specific render" -lsof /dev/dri/renderD${targetRender} - -echo -e "\n ---- show specific render pids detail" -lsof /dev/dri/renderD${targetRender} | grep "mem" | awk '{print $2}' | xargs -I {} ps {} \ No newline at end of file +for ((i = 0; i < ${#pid_lines_array[@]}; i++)); do + pid_line=${pid_lines_array[$i]} + pid=$(echo "$gpu_details" | awk '{print $2}' | sed -n "${pid_line}p") + gpu_line=$((pid_line + 1)) + pid_gpu=$(echo "$gpu_details" | sed -n "${gpu_line}p" | sed -e 's/^[ ]*//g' | sed -e 's/[ ]*$//g') + if [ "$pid_gpu" == "$target_device" ]; then + echo "kill pid: $pid, gpu: $pid_gpu" + kill -9 $pid + fi +done diff --git a/tools/ci_build/github/pai/pai_huggingface_bert_large_test.sh b/tools/ci_build/github/pai/pai_huggingface_bert_large_test.sh new file mode 100644 index 0000000000..ee1f048212 --- /dev/null +++ b/tools/ci_build/github/pai/pai_huggingface_bert_large_test.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +set -ex + +rocm_version=$1 +# grep -c prints 0 but exits 1 when nothing matches; '|| true' keeps 'set -e' from aborting on hosts without an MI250 so the MI100 branch below stays reachable. +mi200_gpus=$(rocm-smi --showproductname | grep -c "MI250" || true) + +echo "mi200_gpus: $mi200_gpus" + +if [ "$mi200_gpus" -gt "0" ]; then + result_file=ci-mi200.huggingface.bert-large-rocm${rocm_version}.json +else + result_file=ci-mi100.huggingface.bert-large-rocm${rocm_version}.json +fi + +python \ + /stage/huggingface-transformers/examples/pytorch/language-modeling/run_mlm.py \ + --model_name_or_path bert-large-uncased \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --max_steps 260 \ + --logging_steps 20 \ + --output_dir ./test-mlm-bbu \ + --overwrite_output_dir \ + --per_device_train_batch_size 8 \ + --fp16 \ + --dataloader_num_workers 1 \ + --ort \ + --skip_memory_metrics + +cat ci-pipeline-actual.json + +python /onnxruntime_src/orttraining/tools/ci_test/compare_huggingface.py \ + ci-pipeline-actual.json \ + /onnxruntime_src/orttraining/tools/ci_test/results/${result_file} diff --git a/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile
b/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile index 23cd963056..9fa334767c 100644 --- a/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile +++ b/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile @@ -1,11 +1,6 @@ FROM rocm/pytorch:rocm5.5_ubuntu20.04_py3.8_pytorch_1.13.1 -ARG BUILD_UID=1001 -ARG BUILD_USER=onnxruntimedev -RUN adduser --uid $BUILD_UID $BUILD_USER -RUN echo "$BUILD_USER ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers.d/$BUILD_USER - -WORKDIR /home/$BUILD_USER +WORKDIR /stage # from rocm/pytorch's image, work around ucx's dlopen replacement conflicting with shared provider RUN cd /opt/mpi_install/ucx/build &&\ @@ -29,8 +24,6 @@ RUN mkdir -p /tmp/ccache && \ RUN apt-get update && apt-get install -y cifs-utils -USER $BUILD_USER - # rocm-ci branch contains instrumentation needed for loss curves and perf RUN git clone https://github.com/microsoft/huggingface-transformers.git &&\ cd huggingface-transformers &&\ @@ -59,3 +52,8 @@ RUN pip install \ RUN pip install torch-ort --no-dependencies ENV ORTMODULE_ONNX_OPSET_VERSION=15 + +ARG BUILD_UID=1001 +ARG BUILD_USER=onnxruntimedev +RUN adduser --uid $BUILD_UID $BUILD_USER +WORKDIR /home/$BUILD_USER