[ROCm] Move ROCm build step on CPU only machine (#16596)

- Move the ROCm build step to a CPU-only machine
- Add performance data for the Hugging Face bert-large model on the
MI200
- At the beginning of the test step, check the agent's GPU usage and
kill any processes still occupying the GPU, which may be left over from
previous tasks that exited abnormally.
- Use different docker images for the build and test steps. They differ
in the `uid` and `user` used when building the docker image and creating
the docker container.
This commit is contained in:
PeixuanZuo 2023-07-10 11:55:10 +08:00 committed by GitHub
parent bcebd3b1ca
commit cb4bf4f5c8
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 202 additions and 106 deletions

View file

@ -0,0 +1,57 @@
{
"steps": [
{
"step": 20,
"loss": 2.0017
},
{
"step": 40,
"loss": 1.8337
},
{
"step": 60,
"loss": 1.7538
},
{
"step": 80,
"loss": 1.6728
},
{
"step": 100,
"loss": 1.6656
},
{
"step": 120,
"loss": 1.6752
},
{
"step": 140,
"loss": 1.6335
},
{
"step": 160,
"loss": 1.6815
},
{
"step": 180,
"loss": 1.6155
},
{
"step": 200,
"loss": 1.6177
},
{
"step": 220,
"loss": 1.632
},
{
"step": 240,
"loss": 1.5161
},
{
"step": 260,
"loss": 1.5433
}
],
"samples_per_second": 32.335
}

View file

@ -1,29 +1,27 @@
trigger: none
name: 'orttraining_ci_$(Date:yyyyMMdd)_$(Rev:r)'
variables:
- name: video
value: 44
- name: render
value: 109
- name: RocmVersion
value: 5.5
- name: BuildConfig
value: Release
jobs:
- job: AMD_CI
- job: Linux_Build
variables:
skipComponentGovernanceDetection: true
CCACHE_DIR: $(Pipeline.Workspace)/ccache
TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)]
workspace:
clean: all
pool: 'AMD-GPU'
timeoutInMinutes: 150
# gid of video and render group on gcramdrr1-mi100-085 and -86
variables:
- name: video
value: 44
- name: render
value: 109
- name: onnxruntimeBuildSucceeded
value: false
- name: RocmVersion
value: 5.5
- name: BuildConfig
value: Release
- name: CCACHE_DIR
value: $(Pipeline.Workspace)/ccache
- name: TODAY
value: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)]
pool: onnxruntime-Ubuntu2004-AMD-CPU
timeoutInMinutes: 120
steps:
- task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3
@ -38,22 +36,13 @@ jobs:
parameters:
Dockerfile: tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile
Context: tools/ci_build/github/linux/docker
DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )"
Repository: onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion)
Repository: onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion)-build
#- script: |-
# sed -i 's|session_options.use_deterministic_compute = False|session_options.use_deterministic_compute = True|g' \
# orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py
# displayName: 'Toggle ON deterministic compute mode for ORTModule'
- task: CmdLine@2
inputs:
script: |-
echo "Select agent: $(Agent.Name), GPU: $HIP_VISIBLE_DEVICES, render: $DRIVER_RENDER"
bash $(Build.SourcesDirectory)/tools/ci_build/github/pai/pai_get_thread.sh $(Agent.Name)
workingDirectory: $(Build.SourcesDirectory)
displayName: 'Check ROCm Environment'
- task: Cache@2
inputs:
key: '"$(TODAY)" | "$(Build.SourceBranch)" | "$(Build.SourceVersion)"'
@ -75,17 +64,13 @@ jobs:
docker run --rm \
--security-opt seccomp=unconfined \
--shm-size=1024m \
--device=/dev/kfd \
--device=/dev/dri/renderD$DRIVER_RENDER \
--group-add $(video) \
--group-add $(render) \
--user onnxruntimedev \
--user $UID:$(id -g $USER) \
--volume $(Build.SourcesDirectory):/onnxruntime_src \
--volume $(Build.BinariesDirectory):/build \
--volume $(CCACHE_DIR):/cache \
-e CCACHE_DIR=/cache \
--workdir /onnxruntime_src \
onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion) \
onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion)-build \
/bin/bash -c "
set -ex; \
ccache -s; \
@ -104,7 +89,7 @@ jobs:
--update \
--build_dir /build \
--build \
--parallel 32 \
--parallel \
--build_wheel \
--skip_submodule_sync \
--skip_tests; \
@ -112,12 +97,49 @@ jobs:
ccache -z"
displayName: 'Build onnxruntime'
- task: PublishPipelineArtifact@0
displayName: 'Publish Pipeline Artifact'
inputs:
artifactName: 'drop-linux'
targetPath: '$(Build.BinariesDirectory)/Release'
- bash: |-
# Do not output ##vso[] commands with `set -x` or they may be parsed again and include a trailing quote.
set +x
echo "##vso[task.setvariable variable=onnxruntimeBuildSucceeded]true"
displayName: 'Set Onnxruntime Build Succeeded'
- template: templates/explicitly-defined-final-tasks.yml
- job: Linux_Test
workspace:
clean: all
pool: AMD-GPU
dependsOn:
- Linux_Build
timeoutInMinutes: 120
steps:
- task: DownloadPipelineArtifact@2
displayName: 'Download Pipeline Artifact'
inputs:
buildType: 'current'
artifactName: 'drop-linux'
targetPath: '$(Build.BinariesDirectory)/Release'
- checkout: self
clean: true
submodules: recursive
- template: templates/get-docker-image-steps.yml
parameters:
Dockerfile: tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile
Context: tools/ci_build/github/linux/docker
DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )"
Repository: onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion)-test
- task: CmdLine@2
inputs:
script: |-
echo "Select agent: $(Agent.Name), GPU: $HIP_VISIBLE_DEVICES, render: $DRIVER_RENDER"
bash $(Build.SourcesDirectory)/tools/ci_build/github/pai/pai_get_thread.sh $(Agent.Name) $HIP_VISIBLE_DEVICES
workingDirectory: $(Build.SourcesDirectory)
displayName: 'Check ROCm Environment'
- task: CmdLine@2
inputs:
@ -133,11 +155,14 @@ jobs:
--volume $(Build.SourcesDirectory):/onnxruntime_src \
--volume $(Build.BinariesDirectory):/build \
--workdir /build/$(BuildConfig) \
onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion) \
/onnxruntime_src/tools/ci_build/github/pai/pai_test_launcher.sh
onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion)-test \
/bin/bash -c "
set -ex; \
chmod a+x /build/Release/onnxruntime_test_all; \
/onnxruntime_src/tools/ci_build/github/pai/pai_test_launcher.sh"
workingDirectory: $(Build.SourcesDirectory)
displayName: 'Run onnxruntime unit tests'
condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true'))
condition: succeededOrFailed()
- task: CmdLine@2
inputs:
@ -152,14 +177,14 @@ jobs:
--user onnxruntimedev \
--volume $(Build.SourcesDirectory):/onnxruntime_src \
--volume $(Build.BinariesDirectory):/build \
onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion) \
onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion)-test \
/bin/bash -c "
set -ex; \
export KERNEL_EXPLORER_BUILD_DIR=/build/$(BuildConfig); \
pytest /onnxruntime_src/onnxruntime/python/tools/kernel_explorer/ -n 16 --reruns 1"
pytest /onnxruntime_src/onnxruntime/python/tools/kernel_explorer/ -n 8 --reruns 1 --durations=100"
workingDirectory: $(Build.SourcesDirectory)
displayName: 'Run kernel explorer tests'
condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true'))
condition: succeededOrFailed()
- task: CmdLine@2
inputs:
@ -175,32 +200,15 @@ jobs:
--volume $(Build.SourcesDirectory):/onnxruntime_src \
--volume $(Build.BinariesDirectory):/build \
--workdir /build/$(BuildConfig) \
onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion) \
onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion)-test \
/bin/bash -c "
set -ex; \
export PYTHONPATH=/build/$(BuildConfig); \
python -m onnxruntime.training.ortmodule.torch_cpp_extensions.install; \
python \
/home/onnxruntimedev/huggingface-transformers/examples/pytorch/language-modeling/run_mlm.py \
--model_name_or_path bert-large-uncased \
--dataset_name wikitext \
--dataset_config_name wikitext-2-raw-v1 \
--do_train \
--max_steps 260 \
--logging_steps 20 \
--output_dir ./test-mlm-bbu \
--overwrite_output_dir \
--per_device_train_batch_size 8 \
--fp16 \
--dataloader_num_workers 1 \
--ort \
--skip_memory_metrics; \
python /onnxruntime_src/orttraining/tools/ci_test/compare_huggingface.py \
ci-pipeline-actual.json \
/onnxruntime_src/orttraining/tools/ci_test/results/ci-mi100.huggingface.bert-large-rocm$(RocmVersion).json"
bash /onnxruntime_src/tools/ci_build/github/pai/pai_huggingface_bert_large_test.sh $(RocmVersion)"
workingDirectory: $(Build.SourcesDirectory)
displayName: 'Run Python Hugging-Face BERT-L test'
condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true'))
condition: succeededOrFailed()
# Entry point for all ORTModule tests
@ -215,38 +223,28 @@ jobs:
whlfilename=$(basename ${files[0]})
echo $whlfilename
docker run --rm \
-e HIP_VISIBLE_DEVICES \
--security-opt seccomp=unconfined \
--shm-size=1024m \
--device=/dev/kfd \
--device=/dev/dri \
--privileged \
--device=/dev/dri/renderD$DRIVER_RENDER \
--group-add $(video) \
--group-add $(render) \
--user onnxruntimedev \
--volume $(Build.BinariesDirectory):/build \
--volume $(Build.SourcesDirectory):/onnxruntime_src \
--workdir /build/$(BuildConfig) \
onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion) \
onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion)-test \
/bin/bash -c "
set -ex; \
unset PYTHONPATH; \
/onnxruntime_src/tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/mnist" -d "/mnist"; \
/onnxruntime_src/tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/bert-data" -d "/bert_data"; \
pip install /build/$(BuildConfig)/dist/$whlfilename; \
python -m onnxruntime.training.ortmodule.torch_cpp_extensions.install; \
mkdir /home/onnxruntimedev/mnist /home/onnxruntimedev/bert_data; \
python orttraining_ortmodule_tests.py \
--mnist /mnist \
--bert_data /bert_data/hf_data/glue_data/CoLA/original/raw"
--mnist /home/onnxruntimedev/mnist \
--bert_data /home/onnxruntimedev/bert_data/hf_data/glue_data/CoLA/original/raw"
workingDirectory: $(Build.SourcesDirectory)
displayName: 'Run orttraining_ortmodule_tests.py'
condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true'))
condition: succeededOrFailed()
- template: templates/component-governance-component-detection-steps.yml
parameters :
condition : 'succeeded'
- script: docker image prune -f
displayName: Clean docker images
condition: eq(variables['Agent.OS'], 'Linux')
continueOnError: true
- template: templates/clean-agent-build-directory-step.yml

View file

@ -1,10 +1,10 @@
#!/bin/bash
#!/bin/bash
set -ex
agentName=$1
finalCharacter=${agentName: -1}
target_device=$2
echo "agent name $agentName"
echo "agent name final character : $finalCharacter"
targetRender=$((finalCharacter+128))
echo "agent target device : $target_device"
echo -e "\n ---- rocm-smi"
rocm-smi
@ -13,19 +13,26 @@ echo -e "\n ---- rocm-smi --showpids"
rocm-smi --showpids
echo -e "\n ---- rocm-smi --showpidgpus"
rocm-smi --showpidgpus
rocm-smi --showpidgpus
echo -e "\n ---- rocm-smi --showpids detail"
rocm-smi --showpids | awk '$1 ~/[0-9]+/{if((NR>6)) {print $1}}' | xargs -I {} ps {}
echo -e "\n ---- rocm-smi --showmeminfo"
rocm-smi --showmeminfo vram vis_vram gtt
rocm-smi --showmeminfo vram vis_vram gtt
echo -e "\n ---- show all renders"
lsof /dev/dri/renderD*
echo -e "\n ---- Clean up the process that is using the target device"
gpu_details=$(rocm-smi --showpidgpus)
pid_lines=$(echo "$gpu_details" | grep -n "DRM device" | cut -d ":" -f 1)
pid_lines_array=($pid_lines)
echo -e "\n ---- show specific render"
lsof /dev/dri/renderD${targetRender}
echo -e "\n ---- show specific render pids detail"
lsof /dev/dri/renderD${targetRender} | grep "mem" | awk '{print $2}' | xargs -I {} ps {}
# Walk every line number at which the captured `rocm-smi --showpidgpus` output
# (gpu_details) contains "DRM device".
for ((i = 0; i < ${#pid_lines_array[@]}; i++)); do
pid_line=${pid_lines_array[$i]}
# The PID is read from the 2nd whitespace-separated field of that line.
pid=$(echo "$gpu_details" | awk '{print $2}' | sed -n "${pid_line}p")
# The line immediately after it is taken as the GPU entry for that PID;
# leading and trailing spaces are stripped before comparing.
gpu_line=$((pid_line + 1))
pid_gpu=$(echo "$gpu_details" | sed -n "${gpu_line}p" | sed -e 's/^[ ]*//g' | sed -e 's/[ ]*$//g')
# Exact string match against the requested device.
# NOTE(review): if rocm-smi can list several GPUs on this line, such a process
# would not match and would be left running — confirm against real output.
if [ "$pid_gpu" == "$target_device" ]; then
echo "kill pid: $pid, gpu: $pid_gpu"
kill -9 $pid
fi
done

View file

@ -0,0 +1,36 @@
#!/bin/bash
# Trains bert-large-uncased for 260 steps with the Hugging Face run_mlm.py
# example (ORT enabled, fp16), then hands ci-pipeline-actual.json and the
# reference result file for this agent's GPU family to compare_huggingface.py.
#
# Usage: <this script> <rocm_version>
#   rocm_version  ROCm version string used to pick the reference file,
#                 e.g. 5.5 -> ci-mi200.huggingface.bert-large-rocm5.5.json
#
# Expects rocm-smi and python on PATH, the transformers checkout under /stage,
# and the onnxruntime sources mounted at /onnxruntime_src.
set -ex

# Fail before the long training run if the version argument is missing;
# otherwise the reference file name would be malformed and only the final
# comparison would fail.
rocm_version=${1:?usage: $0 <rocm_version>}

# `grep -c` prints 0 but exits with status 1 when nothing matches. Under
# `set -e` that would abort the script on every non-MI250 agent, so the
# MI100 branch below could never run. `|| true` keeps the printed count and
# neutralizes the exit status.
mi200_gpus=$(rocm-smi --showproductname | grep -c "MI250" || true)
echo "mi200_gpus: $mi200_gpus"

if [ "$mi200_gpus" -gt "0" ]; then
  result_file="ci-mi200.huggingface.bert-large-rocm${rocm_version}.json"
else
  result_file="ci-mi100.huggingface.bert-large-rocm${rocm_version}.json"
fi

python \
  /stage/huggingface-transformers/examples/pytorch/language-modeling/run_mlm.py \
  --model_name_or_path bert-large-uncased \
  --dataset_name wikitext \
  --dataset_config_name wikitext-2-raw-v1 \
  --do_train \
  --max_steps 260 \
  --logging_steps 20 \
  --output_dir ./test-mlm-bbu \
  --overwrite_output_dir \
  --per_device_train_batch_size 8 \
  --fp16 \
  --dataloader_num_workers 1 \
  --ort \
  --skip_memory_metrics

# ci-pipeline-actual.json is presumably written by the instrumented run_mlm.py
# into the current directory — TODO confirm. Dump it so the measured numbers
# show up in the CI log.
cat ci-pipeline-actual.json

python /onnxruntime_src/orttraining/tools/ci_test/compare_huggingface.py \
  ci-pipeline-actual.json \
  "/onnxruntime_src/orttraining/tools/ci_test/results/${result_file}"

View file

@ -1,11 +1,6 @@
FROM rocm/pytorch:rocm5.5_ubuntu20.04_py3.8_pytorch_1.13.1
ARG BUILD_UID=1001
ARG BUILD_USER=onnxruntimedev
RUN adduser --uid $BUILD_UID $BUILD_USER
RUN echo "$BUILD_USER ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers.d/$BUILD_USER
WORKDIR /home/$BUILD_USER
WORKDIR /stage
# from rocm/pytorch's image, work around ucx's dlopen replacement conflicting with shared provider
RUN cd /opt/mpi_install/ucx/build &&\
@ -29,8 +24,6 @@ RUN mkdir -p /tmp/ccache && \
RUN apt-get update && apt-get install -y cifs-utils
USER $BUILD_USER
# rocm-ci branch contains instrumentation needed for loss curves and perf
RUN git clone https://github.com/microsoft/huggingface-transformers.git &&\
cd huggingface-transformers &&\
@ -59,3 +52,8 @@ RUN pip install \
RUN pip install torch-ort --no-dependencies
ENV ORTMODULE_ONNX_OPSET_VERSION=15
ARG BUILD_UID=1001
ARG BUILD_USER=onnxruntimedev
RUN adduser --uid $BUILD_UID $BUILD_USER
WORKDIR /home/$BUILD_USER