mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-28 22:56:32 +00:00
[ROCm] Move ROCm build step on CPU only machine (#16596)
- Move the ROCm build step to a CPU-only machine. - Add performance data for the Hugging Face bert-large model on the MI200. - At the beginning of the test step, check the agent's GPU usage and kill any processes occupying the GPU, which may be left over from previous tasks that exited abnormally. - Use different Docker images for the build and test steps; they differ in the `uid` and `user` used when building the Docker image and creating the Docker container.
This commit is contained in:
parent
bcebd3b1ca
commit
cb4bf4f5c8
5 changed files with 202 additions and 106 deletions
|
|
@ -0,0 +1,57 @@
|
|||
{
|
||||
"steps": [
|
||||
{
|
||||
"step": 20,
|
||||
"loss": 2.0017
|
||||
},
|
||||
{
|
||||
"step": 40,
|
||||
"loss": 1.8337
|
||||
},
|
||||
{
|
||||
"step": 60,
|
||||
"loss": 1.7538
|
||||
},
|
||||
{
|
||||
"step": 80,
|
||||
"loss": 1.6728
|
||||
},
|
||||
{
|
||||
"step": 100,
|
||||
"loss": 1.6656
|
||||
},
|
||||
{
|
||||
"step": 120,
|
||||
"loss": 1.6752
|
||||
},
|
||||
{
|
||||
"step": 140,
|
||||
"loss": 1.6335
|
||||
},
|
||||
{
|
||||
"step": 160,
|
||||
"loss": 1.6815
|
||||
},
|
||||
{
|
||||
"step": 180,
|
||||
"loss": 1.6155
|
||||
},
|
||||
{
|
||||
"step": 200,
|
||||
"loss": 1.6177
|
||||
},
|
||||
{
|
||||
"step": 220,
|
||||
"loss": 1.632
|
||||
},
|
||||
{
|
||||
"step": 240,
|
||||
"loss": 1.5161
|
||||
},
|
||||
{
|
||||
"step": 260,
|
||||
"loss": 1.5433
|
||||
}
|
||||
],
|
||||
"samples_per_second": 32.335
|
||||
}
|
||||
|
|
@ -1,29 +1,27 @@
|
|||
trigger: none
|
||||
|
||||
name: 'orttraining_ci_$(Date:yyyyMMdd)_$(Rev:r)'
|
||||
|
||||
variables:
|
||||
- name: video
|
||||
value: 44
|
||||
- name: render
|
||||
value: 109
|
||||
- name: RocmVersion
|
||||
value: 5.5
|
||||
- name: BuildConfig
|
||||
value: Release
|
||||
|
||||
jobs:
|
||||
- job: AMD_CI
|
||||
- job: Linux_Build
|
||||
variables:
|
||||
skipComponentGovernanceDetection: true
|
||||
CCACHE_DIR: $(Pipeline.Workspace)/ccache
|
||||
TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)]
|
||||
workspace:
|
||||
clean: all
|
||||
pool: 'AMD-GPU'
|
||||
timeoutInMinutes: 150
|
||||
|
||||
# gid of video and render group on gcramdrr1-mi100-085 and -86
|
||||
variables:
|
||||
- name: video
|
||||
value: 44
|
||||
- name: render
|
||||
value: 109
|
||||
- name: onnxruntimeBuildSucceeded
|
||||
value: false
|
||||
- name: RocmVersion
|
||||
value: 5.5
|
||||
- name: BuildConfig
|
||||
value: Release
|
||||
- name: CCACHE_DIR
|
||||
value: $(Pipeline.Workspace)/ccache
|
||||
- name: TODAY
|
||||
value: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)]
|
||||
pool: onnxruntime-Ubuntu2004-AMD-CPU
|
||||
timeoutInMinutes: 120
|
||||
|
||||
steps:
|
||||
- task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3
|
||||
|
|
@ -38,22 +36,13 @@ jobs:
|
|||
parameters:
|
||||
Dockerfile: tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile
|
||||
Context: tools/ci_build/github/linux/docker
|
||||
DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )"
|
||||
Repository: onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion)
|
||||
Repository: onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion)-build
|
||||
|
||||
#- script: |-
|
||||
# sed -i 's|session_options.use_deterministic_compute = False|session_options.use_deterministic_compute = True|g' \
|
||||
# orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py
|
||||
# displayName: 'Toggle ON deterministic compute mode for ORTModule'
|
||||
|
||||
- task: CmdLine@2
|
||||
inputs:
|
||||
script: |-
|
||||
echo "Select agent: $(Agent.Name), GPU: $HIP_VISIBLE_DEVICES, render: $DRIVER_RENDER"
|
||||
bash $(Build.SourcesDirectory)/tools/ci_build/github/pai/pai_get_thread.sh $(Agent.Name)
|
||||
workingDirectory: $(Build.SourcesDirectory)
|
||||
displayName: 'Check ROCm Environment'
|
||||
|
||||
- task: Cache@2
|
||||
inputs:
|
||||
key: '"$(TODAY)" | "$(Build.SourceBranch)" | "$(Build.SourceVersion)"'
|
||||
|
|
@ -75,17 +64,13 @@ jobs:
|
|||
docker run --rm \
|
||||
--security-opt seccomp=unconfined \
|
||||
--shm-size=1024m \
|
||||
--device=/dev/kfd \
|
||||
--device=/dev/dri/renderD$DRIVER_RENDER \
|
||||
--group-add $(video) \
|
||||
--group-add $(render) \
|
||||
--user onnxruntimedev \
|
||||
--user $UID:$(id -g $USER) \
|
||||
--volume $(Build.SourcesDirectory):/onnxruntime_src \
|
||||
--volume $(Build.BinariesDirectory):/build \
|
||||
--volume $(CCACHE_DIR):/cache \
|
||||
-e CCACHE_DIR=/cache \
|
||||
--workdir /onnxruntime_src \
|
||||
onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion) \
|
||||
onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion)-build \
|
||||
/bin/bash -c "
|
||||
set -ex; \
|
||||
ccache -s; \
|
||||
|
|
@ -104,7 +89,7 @@ jobs:
|
|||
--update \
|
||||
--build_dir /build \
|
||||
--build \
|
||||
--parallel 32 \
|
||||
--parallel \
|
||||
--build_wheel \
|
||||
--skip_submodule_sync \
|
||||
--skip_tests; \
|
||||
|
|
@ -112,12 +97,49 @@ jobs:
|
|||
ccache -z"
|
||||
displayName: 'Build onnxruntime'
|
||||
|
||||
- task: PublishPipelineArtifact@0
|
||||
displayName: 'Publish Pipeline Artifact'
|
||||
inputs:
|
||||
artifactName: 'drop-linux'
|
||||
targetPath: '$(Build.BinariesDirectory)/Release'
|
||||
|
||||
- bash: |-
|
||||
# Do not output ##vso[] commands with `set -x` or they may be parsed again and include a trailing quote.
|
||||
set +x
|
||||
echo "##vso[task.setvariable variable=onnxruntimeBuildSucceeded]true"
|
||||
displayName: 'Set Onnxruntime Build Succeeded'
|
||||
- template: templates/explicitly-defined-final-tasks.yml
|
||||
|
||||
|
||||
- job: Linux_Test
|
||||
workspace:
|
||||
clean: all
|
||||
pool: AMD-GPU
|
||||
dependsOn:
|
||||
- Linux_Build
|
||||
timeoutInMinutes: 120
|
||||
|
||||
steps:
|
||||
- task: DownloadPipelineArtifact@2
|
||||
displayName: 'Download Pipeline Artifact'
|
||||
inputs:
|
||||
buildType: 'current'
|
||||
artifactName: 'drop-linux'
|
||||
targetPath: '$(Build.BinariesDirectory)/Release'
|
||||
|
||||
- checkout: self
|
||||
clean: true
|
||||
submodules: recursive
|
||||
|
||||
- template: templates/get-docker-image-steps.yml
|
||||
parameters:
|
||||
Dockerfile: tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile
|
||||
Context: tools/ci_build/github/linux/docker
|
||||
DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )"
|
||||
Repository: onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion)-test
|
||||
|
||||
- task: CmdLine@2
|
||||
inputs:
|
||||
script: |-
|
||||
echo "Select agent: $(Agent.Name), GPU: $HIP_VISIBLE_DEVICES, render: $DRIVER_RENDER"
|
||||
bash $(Build.SourcesDirectory)/tools/ci_build/github/pai/pai_get_thread.sh $(Agent.Name) $HIP_VISIBLE_DEVICES
|
||||
workingDirectory: $(Build.SourcesDirectory)
|
||||
displayName: 'Check ROCm Environment'
|
||||
|
||||
- task: CmdLine@2
|
||||
inputs:
|
||||
|
|
@ -133,11 +155,14 @@ jobs:
|
|||
--volume $(Build.SourcesDirectory):/onnxruntime_src \
|
||||
--volume $(Build.BinariesDirectory):/build \
|
||||
--workdir /build/$(BuildConfig) \
|
||||
onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion) \
|
||||
/onnxruntime_src/tools/ci_build/github/pai/pai_test_launcher.sh
|
||||
onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion)-test \
|
||||
/bin/bash -c "
|
||||
set -ex; \
|
||||
chmod a+x /build/Release/onnxruntime_test_all; \
|
||||
/onnxruntime_src/tools/ci_build/github/pai/pai_test_launcher.sh"
|
||||
workingDirectory: $(Build.SourcesDirectory)
|
||||
displayName: 'Run onnxruntime unit tests'
|
||||
condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true'))
|
||||
condition: succeededOrFailed()
|
||||
|
||||
- task: CmdLine@2
|
||||
inputs:
|
||||
|
|
@ -152,14 +177,14 @@ jobs:
|
|||
--user onnxruntimedev \
|
||||
--volume $(Build.SourcesDirectory):/onnxruntime_src \
|
||||
--volume $(Build.BinariesDirectory):/build \
|
||||
onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion) \
|
||||
onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion)-test \
|
||||
/bin/bash -c "
|
||||
set -ex; \
|
||||
export KERNEL_EXPLORER_BUILD_DIR=/build/$(BuildConfig); \
|
||||
pytest /onnxruntime_src/onnxruntime/python/tools/kernel_explorer/ -n 16 --reruns 1"
|
||||
pytest /onnxruntime_src/onnxruntime/python/tools/kernel_explorer/ -n 8 --reruns 1 --durations=100"
|
||||
workingDirectory: $(Build.SourcesDirectory)
|
||||
displayName: 'Run kernel explorer tests'
|
||||
condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true'))
|
||||
condition: succeededOrFailed()
|
||||
|
||||
- task: CmdLine@2
|
||||
inputs:
|
||||
|
|
@ -175,32 +200,15 @@ jobs:
|
|||
--volume $(Build.SourcesDirectory):/onnxruntime_src \
|
||||
--volume $(Build.BinariesDirectory):/build \
|
||||
--workdir /build/$(BuildConfig) \
|
||||
onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion) \
|
||||
onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion)-test \
|
||||
/bin/bash -c "
|
||||
set -ex; \
|
||||
export PYTHONPATH=/build/$(BuildConfig); \
|
||||
python -m onnxruntime.training.ortmodule.torch_cpp_extensions.install; \
|
||||
python \
|
||||
/home/onnxruntimedev/huggingface-transformers/examples/pytorch/language-modeling/run_mlm.py \
|
||||
--model_name_or_path bert-large-uncased \
|
||||
--dataset_name wikitext \
|
||||
--dataset_config_name wikitext-2-raw-v1 \
|
||||
--do_train \
|
||||
--max_steps 260 \
|
||||
--logging_steps 20 \
|
||||
--output_dir ./test-mlm-bbu \
|
||||
--overwrite_output_dir \
|
||||
--per_device_train_batch_size 8 \
|
||||
--fp16 \
|
||||
--dataloader_num_workers 1 \
|
||||
--ort \
|
||||
--skip_memory_metrics; \
|
||||
python /onnxruntime_src/orttraining/tools/ci_test/compare_huggingface.py \
|
||||
ci-pipeline-actual.json \
|
||||
/onnxruntime_src/orttraining/tools/ci_test/results/ci-mi100.huggingface.bert-large-rocm$(RocmVersion).json"
|
||||
bash /onnxruntime_src/tools/ci_build/github/pai/pai_huggingface_bert_large_test.sh $(RocmVersion)"
|
||||
workingDirectory: $(Build.SourcesDirectory)
|
||||
displayName: 'Run Python Hugging-Face BERT-L test'
|
||||
condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true'))
|
||||
condition: succeededOrFailed()
|
||||
|
||||
|
||||
# Entry point for all ORTModule tests
|
||||
|
|
@ -215,38 +223,28 @@ jobs:
|
|||
whlfilename=$(basename ${files[0]})
|
||||
echo $whlfilename
|
||||
docker run --rm \
|
||||
-e HIP_VISIBLE_DEVICES \
|
||||
--security-opt seccomp=unconfined \
|
||||
--shm-size=1024m \
|
||||
--device=/dev/kfd \
|
||||
--device=/dev/dri \
|
||||
--privileged \
|
||||
--device=/dev/dri/renderD$DRIVER_RENDER \
|
||||
--group-add $(video) \
|
||||
--group-add $(render) \
|
||||
--user onnxruntimedev \
|
||||
--volume $(Build.BinariesDirectory):/build \
|
||||
--volume $(Build.SourcesDirectory):/onnxruntime_src \
|
||||
--workdir /build/$(BuildConfig) \
|
||||
onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion) \
|
||||
onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion)-test \
|
||||
/bin/bash -c "
|
||||
set -ex; \
|
||||
unset PYTHONPATH; \
|
||||
/onnxruntime_src/tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/mnist" -d "/mnist"; \
|
||||
/onnxruntime_src/tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/bert-data" -d "/bert_data"; \
|
||||
pip install /build/$(BuildConfig)/dist/$whlfilename; \
|
||||
python -m onnxruntime.training.ortmodule.torch_cpp_extensions.install; \
|
||||
mkdir /home/onnxruntimedev/mnist /home/onnxruntimedev/bert_data; \
|
||||
python orttraining_ortmodule_tests.py \
|
||||
--mnist /mnist \
|
||||
--bert_data /bert_data/hf_data/glue_data/CoLA/original/raw"
|
||||
--mnist /home/onnxruntimedev/mnist \
|
||||
--bert_data /home/onnxruntimedev/bert_data/hf_data/glue_data/CoLA/original/raw"
|
||||
workingDirectory: $(Build.SourcesDirectory)
|
||||
displayName: 'Run orttraining_ortmodule_tests.py'
|
||||
condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true'))
|
||||
condition: succeededOrFailed()
|
||||
|
||||
- template: templates/component-governance-component-detection-steps.yml
|
||||
parameters :
|
||||
condition : 'succeeded'
|
||||
|
||||
- script: docker image prune -f
|
||||
displayName: Clean docker images
|
||||
condition: eq(variables['Agent.OS'], 'Linux')
|
||||
continueOnError: true
|
||||
- template: templates/clean-agent-build-directory-step.yml
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#!/bin/bash
|
||||
#!/bin/bash
|
||||
set -ex
|
||||
|
||||
agentName=$1
|
||||
finalCharacter=${agentName: -1}
|
||||
target_device=$2
|
||||
echo "agent name $agentName"
|
||||
echo "agent name final character : $finalCharacter"
|
||||
targetRender=$((finalCharacter+128))
|
||||
echo "agent target device : $target_device"
|
||||
|
||||
echo -e "\n ---- rocm-smi"
|
||||
rocm-smi
|
||||
|
|
@ -13,19 +13,26 @@ echo -e "\n ---- rocm-smi --showpids"
|
|||
rocm-smi --showpids
|
||||
|
||||
echo -e "\n ---- rocm-smi --showpidgpus"
|
||||
rocm-smi --showpidgpus
|
||||
rocm-smi --showpidgpus
|
||||
|
||||
echo -e "\n ---- rocm-smi --showpids detail"
|
||||
rocm-smi --showpids | awk '$1 ~/[0-9]+/{if((NR>6)) {print $1}}' | xargs -I {} ps {}
|
||||
|
||||
echo -e "\n ---- rocm-smi --showmeminfo"
|
||||
rocm-smi --showmeminfo vram vis_vram gtt
|
||||
rocm-smi --showmeminfo vram vis_vram gtt
|
||||
|
||||
echo -e "\n ---- show all renders"
|
||||
lsof /dev/dri/renderD*
|
||||
echo -e "\n ---- Clean up the process that is using the target device"
|
||||
gpu_details=$(rocm-smi --showpidgpus)
|
||||
pid_lines=$(echo "$gpu_details" | grep -n "DRM device" | cut -d ":" -f 1)
|
||||
pid_lines_array=($pid_lines)
|
||||
|
||||
echo -e "\n ---- show specific render"
|
||||
lsof /dev/dri/renderD${targetRender}
|
||||
|
||||
echo -e "\n ---- show specific render pids detail"
|
||||
lsof /dev/dri/renderD${targetRender} | grep "mem" | awk '{print $2}' | xargs -I {} ps {}
|
||||
for ((i = 0; i < ${#pid_lines_array[@]}; i++)); do
|
||||
pid_line=${pid_lines_array[$i]}
|
||||
pid=$(echo "$gpu_details" | awk '{print $2}' | sed -n "${pid_line}p")
|
||||
gpu_line=$((pid_line + 1))
|
||||
pid_gpu=$(echo "$gpu_details" | sed -n "${gpu_line}p" | sed -e 's/^[ ]*//g' | sed -e 's/[ ]*$//g')
|
||||
if [ "$pid_gpu" == "$target_device" ]; then
|
||||
echo "kill pid: $pid, gpu: $pid_gpu"
|
||||
kill -9 $pid
|
||||
fi
|
||||
done
|
||||
|
|
|
|||
36
tools/ci_build/github/pai/pai_huggingface_bert_large_test.sh
Normal file
36
tools/ci_build/github/pai/pai_huggingface_bert_large_test.sh
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
#!/bin/bash
|
||||
|
||||
set -ex
|
||||
|
||||
rocm_version=$1
|
||||
mi200_gpus=$(rocm-smi --showproductname | grep -c "MI250")
|
||||
|
||||
echo "mi200_gpus: $mi200_gpus"
|
||||
|
||||
if [ "$mi200_gpus" -gt "0" ]; then
|
||||
result_file=ci-mi200.huggingface.bert-large-rocm${rocm_version}.json
|
||||
else
|
||||
result_file=ci-mi100.huggingface.bert-large-rocm${rocm_version}.json
|
||||
fi
|
||||
|
||||
python \
|
||||
/stage/huggingface-transformers/examples/pytorch/language-modeling/run_mlm.py \
|
||||
--model_name_or_path bert-large-uncased \
|
||||
--dataset_name wikitext \
|
||||
--dataset_config_name wikitext-2-raw-v1 \
|
||||
--do_train \
|
||||
--max_steps 260 \
|
||||
--logging_steps 20 \
|
||||
--output_dir ./test-mlm-bbu \
|
||||
--overwrite_output_dir \
|
||||
--per_device_train_batch_size 8 \
|
||||
--fp16 \
|
||||
--dataloader_num_workers 1 \
|
||||
--ort \
|
||||
--skip_memory_metrics
|
||||
|
||||
cat ci-pipeline-actual.json
|
||||
|
||||
python /onnxruntime_src/orttraining/tools/ci_test/compare_huggingface.py \
|
||||
ci-pipeline-actual.json \
|
||||
/onnxruntime_src/orttraining/tools/ci_test/results/${result_file}
|
||||
|
|
@ -1,11 +1,6 @@
|
|||
FROM rocm/pytorch:rocm5.5_ubuntu20.04_py3.8_pytorch_1.13.1
|
||||
|
||||
ARG BUILD_UID=1001
|
||||
ARG BUILD_USER=onnxruntimedev
|
||||
RUN adduser --uid $BUILD_UID $BUILD_USER
|
||||
RUN echo "$BUILD_USER ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers.d/$BUILD_USER
|
||||
|
||||
WORKDIR /home/$BUILD_USER
|
||||
WORKDIR /stage
|
||||
|
||||
# from rocm/pytorch's image, work around ucx's dlopen replacement conflicting with shared provider
|
||||
RUN cd /opt/mpi_install/ucx/build &&\
|
||||
|
|
@ -29,8 +24,6 @@ RUN mkdir -p /tmp/ccache && \
|
|||
|
||||
RUN apt-get update && apt-get install -y cifs-utils
|
||||
|
||||
USER $BUILD_USER
|
||||
|
||||
# rocm-ci branch contains instrumentation needed for loss curves and perf
|
||||
RUN git clone https://github.com/microsoft/huggingface-transformers.git &&\
|
||||
cd huggingface-transformers &&\
|
||||
|
|
@ -59,3 +52,8 @@ RUN pip install \
|
|||
|
||||
RUN pip install torch-ort --no-dependencies
|
||||
ENV ORTMODULE_ONNX_OPSET_VERSION=15
|
||||
|
||||
ARG BUILD_UID=1001
|
||||
ARG BUILD_USER=onnxruntimedev
|
||||
RUN adduser --uid $BUILD_UID $BUILD_USER
|
||||
WORKDIR /home/$BUILD_USER
|
||||
|
|
|
|||
Loading…
Reference in a new issue