[ROCm] Optimize ROCm CI pipeline 2 (#16691)

- Set `KERNEL_EXPLORER_TEST_USE_CUPY=1` to replace numpy with cupy in
the kernel explorer tests.

With `KERNEL_EXPLORER_TEST_USE_CUPY=0`, the CPU utilization is as shown below:

![image](https://github.com/microsoft/onnxruntime/assets/94887879/91724b78-0b4e-4cbd-ad88-83cad9976472)

With `KERNEL_EXPLORER_TEST_USE_CUPY=1`, the CPU utilization is as shown below:

![image](https://github.com/microsoft/onnxruntime/assets/94887879/58239911-667c-4d5f-bb78-deca60d0266f)


- Use the `Bash@3` task instead of `CmdLine@2` to run `pai_clean_device.sh`.
- Update the shell scripts to take named options parsed with `getopts`.
This commit is contained in:
PeixuanZuo 2023-07-24 13:57:48 +08:00 committed by GitHub
parent 21ef14476b
commit 8ede2f139e
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 59 additions and 41 deletions

View file

@ -133,12 +133,11 @@ jobs:
DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )"
Repository: onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion)-test
- task: CmdLine@2
- task: Bash@3
inputs:
script: |-
echo "Select agent: $(Agent.Name), GPU: $HIP_VISIBLE_DEVICES, render: $DRIVER_RENDER"
bash $(Build.SourcesDirectory)/tools/ci_build/github/pai/pai_clean_device.sh $(Agent.Name) $HIP_VISIBLE_DEVICES
workingDirectory: $(Build.SourcesDirectory)
targetType: filePath
filePath: $(Build.SourcesDirectory)/tools/ci_build/github/pai/pai_clean_device.sh
arguments: -n $(Agent.Name) -d $HIP_VISIBLE_DEVICES -r $DRIVER_RENDER
displayName: 'Check ROCm Environment'
- task: CmdLine@2
@ -182,6 +181,7 @@ jobs:
set -ex; \
export KERNEL_EXPLORER_BUILD_DIR=/build/$(BuildConfig); \
export KERNEL_EXPLORER_BATCHED_GEMM_MAX_BATCH_SIZE=8; \
export KERNEL_EXPLORER_TEST_USE_CUPY=1; \
pytest /onnxruntime_src/onnxruntime/python/tools/kernel_explorer/ -n 8 --reruns 1 --durations=100"
workingDirectory: $(Build.SourcesDirectory)
displayName: 'Run kernel explorer tests'
@ -206,7 +206,7 @@ jobs:
set -ex; \
export PYTHONPATH=/build/$(BuildConfig); \
python -m onnxruntime.training.ortmodule.torch_cpp_extensions.install; \
bash /onnxruntime_src/tools/ci_build/github/pai/pai_huggingface_bert_large_test.sh $(RocmVersion)"
bash /onnxruntime_src/tools/ci_build/github/pai/pai_huggingface_bert_large_test.sh -v $(RocmVersion)"
workingDirectory: $(Build.SourcesDirectory)
displayName: 'Run Python Hugging-Face BERT-L test'
condition: succeededOrFailed()
@ -250,11 +250,12 @@ jobs:
displayName: 'Run orttraining_ortmodule_tests.py'
condition: succeededOrFailed()
- task: CmdLine@2
- task: Bash@3
inputs:
script: |-
bash $(Build.SourcesDirectory)/tools/ci_build/github/pai/pai_clean_device.sh $(Agent.Name) $HIP_VISIBLE_DEVICES
workingDirectory: $(Build.SourcesDirectory)
targetType: filePath
filePath: $(Build.SourcesDirectory)/tools/ci_build/github/pai/pai_clean_device.sh
arguments: -n $(Agent.Name) -d $HIP_VISIBLE_DEVICES -r $DRIVER_RENDER
displayName: 'Clean ROCm Environment'
condition: always()

View file

@ -6,7 +6,8 @@ ARG MIGRAPHX_VERSION=rocm-5.5.0
ENV DEBIAN_FRONTEND noninteractive
ENV MIGRAPHX_DISABLE_FAST_GELU=1
RUN apt-get clean && apt-get update -y && apt-get upgrade -y && apt-get install -y locales unzip
RUN apt-get update -y && apt-get upgrade -y && apt-get autoremove -y && \
apt-get install -y locales unzip && apt-get clean -y
RUN locale-gen en_US.UTF-8
RUN update-locale LANG=en_US.UTF-8
ENV LC_ALL C.UTF-8

View file

@ -1,38 +1,47 @@
#!/bin/bash
set -ex
agentName=$1
target_device=$2
echo "agent name $agentName"
echo "agent target device : $target_device"
usage() { echo "Usage: $0 [-n <agent name>] [-d <target device>] [-r <driver render>]" 1>&2; exit 1; }
echo -e "\n ---- rocm-smi"
while getopts "n:d:r:" parameter_Option
do case "${parameter_Option}"
in
n) AGENT_NAME=${OPTARG};;
d) TARGET_DEVICE=${OPTARG};;
r) DRIVER_RENDER=${OPTARG};;
*) usage ;;
esac
done
echo "Agent Name: $AGENT_NAME, Target Device: $TARGET_DEVICE, Driver Render: $DRIVER_RENDER"
echo -e "\n ---- Execute rocm-smi"
rocm-smi
echo -e "\n ---- rocm-smi --showpids"
echo -e "\n ---- Execute rocm-smi --showpids"
rocm-smi --showpids
echo -e "\n ---- rocm-smi --showpidgpus"
echo -e "\n ---- Execute rocm-smi --showpidgpus"
rocm-smi --showpidgpus
echo -e "\n ---- rocm-smi --showpids detail"
echo -e "\n ---- Execute rocm-smi --showpids detail"
rocm-smi --showpids | awk '$1 ~/[0-9]+/{if((NR>6)) {print $1}}' | xargs -I {} ps {}
echo -e "\n ---- rocm-smi --showmeminfo"
echo -e "\n ---- Execute rocm-smi --showmeminfo"
rocm-smi --showmeminfo vram vis_vram gtt
echo -e "\n ---- Clean up the process that is using the target device"
gpu_details=$(rocm-smi --showpidgpus)
pid_lines=$(echo "$gpu_details" | grep -n "DRM device" | cut -d ":" -f 1)
pid_lines_array=($pid_lines)
echo -e "\n ---- Clean up processes that use the target device $TARGET_DEVICE"
GPU_USED_BY_PIDS=$(rocm-smi --showpidgpus)
PID_NUMBERS_LINES=$(echo "$GPU_USED_BY_PIDS" | grep -n "DRM device" | cut -d ":" -f 1)
PID_NUMBERS_LINES_ARRAY=($PID_NUMBERS_LINES)
for ((i = 0; i < ${#pid_lines_array[@]}; i++)); do
pid_line=${pid_lines_array[$i]}
pid=$(echo "$gpu_details" | awk '{print $2}' | sed -n "${pid_line}p")
gpu_line=$((pid_line + 1))
pid_gpu=$(echo "$gpu_details" | sed -n "${gpu_line}p" | sed -e 's/^[ ]*//g' | sed -e 's/[ ]*$//g')
if [ "$pid_gpu" == "$target_device" ]; then
echo "kill pid: $pid, gpu: $pid_gpu"
kill -9 $pid
for ((i = 0; i < ${#PID_NUMBERS_LINES_ARRAY[@]}; i++)); do
PID_NUMBER_LINE=${PID_NUMBERS_LINES_ARRAY[$i]}
PID_NUMBER=$(echo "$GPU_USED_BY_PIDS" | awk '{print $2}' | sed -n "${PID_NUMBER_LINE}p")
GPU_USED_BY_PID_LINE=$((PID_NUMBER_LINE + 1))
GPU_USED_BY_PID=$(echo "$GPU_USED_BY_PIDS" | sed -n "${GPU_USED_BY_PID_LINE}p" | sed -e 's/^[ ]*//g' | sed -e 's/[ ]*$//g')
if [ "$GPU_USED_BY_PID" == "$TARGET_DEVICE" ]; then
echo "kill pid: $PID_NUMBER, using gpu: $GPU_USED_BY_PID"
kill -9 "$PID_NUMBER"
fi
done

View file

@ -2,15 +2,22 @@
set -ex
rocm_version=$1
mi200_gpus=$(rocm-smi --showproductname | grep -c "MI250" | xargs)
usage() { echo "Usage: $0 [-v <ROCm version>]" 1>&2; exit 1; }
echo "mi200_gpus: $mi200_gpus"
while getopts "v:" parameter_Option
do case "${parameter_Option}"
in
v) ROCM_VERSION=${OPTARG};;
*) usage ;;
esac
done
if [ "$mi200_gpus" -gt "0" ]; then
result_file=ci-mi200.huggingface.bert-large-rocm${rocm_version}.json
MI200_DEVICE_NUMBERS=$(rocm-smi --showproductname | grep -c "MI250" | xargs)
if [ "$MI200_DEVICE_NUMBERS" -gt "0" ]; then
RESULT_FILE=ci-mi200.huggingface.bert-large-rocm${ROCM_VERSION}.json
else
result_file=ci-mi100.huggingface.bert-large-rocm${rocm_version}.json
RESULT_FILE=ci-mi100.huggingface.bert-large-rocm${ROCM_VERSION}.json
fi
python \
@ -33,4 +40,4 @@ cat ci-pipeline-actual.json
python /onnxruntime_src/orttraining/tools/ci_test/compare_huggingface.py \
ci-pipeline-actual.json \
/onnxruntime_src/orttraining/tools/ci_test/results/${result_file}
/onnxruntime_src/orttraining/tools/ci_test/results/"$RESULT_FILE"

View file

@ -1,6 +1,6 @@
FROM rocm/pytorch:rocm5.5_ubuntu20.04_py3.8_pytorch_1.13.1
FROM rocm/cupy:rocm5.5.0_ubuntu20.04_py3.8_pytorch2.0.0_cupy13.0.0
RUN apt-get update -y && apt-get upgrade -y
RUN apt-get update -y && apt-get upgrade -y && apt-get autoremove -y && apt-get clean -y
WORKDIR /stage