mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-04 23:59:56 +00:00
[ROCm] Optimize ROCm CI pipeline 2 (#16691)
- Set `KERNEL_EXPLORER_TEST_USE_CUPY=1` to replace numpy with cupy in the kernel explorer tests.
  - With `KERNEL_EXPLORER_TEST_USE_CUPY=0`, the CPU utilization is as shown below: (image not included here)
  - With `KERNEL_EXPLORER_TEST_USE_CUPY=1`, the CPU utilization is as shown below: (image not included here)
- Use `Bash@3`.
- Update the shell scripts.
This commit is contained in:
parent
21ef14476b
commit
8ede2f139e
5 changed files with 59 additions and 41 deletions
|
|
@ -133,12 +133,11 @@ jobs:
|
|||
DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )"
|
||||
Repository: onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion)-test
|
||||
|
||||
- task: CmdLine@2
|
||||
- task: Bash@3
|
||||
inputs:
|
||||
script: |-
|
||||
echo "Select agent: $(Agent.Name), GPU: $HIP_VISIBLE_DEVICES, render: $DRIVER_RENDER"
|
||||
bash $(Build.SourcesDirectory)/tools/ci_build/github/pai/pai_clean_device.sh $(Agent.Name) $HIP_VISIBLE_DEVICES
|
||||
workingDirectory: $(Build.SourcesDirectory)
|
||||
targetType: filePath
|
||||
filePath: $(Build.SourcesDirectory)/tools/ci_build/github/pai/pai_clean_device.sh
|
||||
arguments: -n $(Agent.Name) -d $HIP_VISIBLE_DEVICES -r $DRIVER_RENDER
|
||||
displayName: 'Check ROCm Environment'
|
||||
|
||||
- task: CmdLine@2
|
||||
|
|
@ -182,6 +181,7 @@ jobs:
|
|||
set -ex; \
|
||||
export KERNEL_EXPLORER_BUILD_DIR=/build/$(BuildConfig); \
|
||||
export KERNEL_EXPLORER_BATCHED_GEMM_MAX_BATCH_SIZE=8; \
|
||||
export KERNEL_EXPLORER_TEST_USE_CUPY=1; \
|
||||
pytest /onnxruntime_src/onnxruntime/python/tools/kernel_explorer/ -n 8 --reruns 1 --durations=100"
|
||||
workingDirectory: $(Build.SourcesDirectory)
|
||||
displayName: 'Run kernel explorer tests'
|
||||
|
|
@ -206,7 +206,7 @@ jobs:
|
|||
set -ex; \
|
||||
export PYTHONPATH=/build/$(BuildConfig); \
|
||||
python -m onnxruntime.training.ortmodule.torch_cpp_extensions.install; \
|
||||
bash /onnxruntime_src/tools/ci_build/github/pai/pai_huggingface_bert_large_test.sh $(RocmVersion)"
|
||||
bash /onnxruntime_src/tools/ci_build/github/pai/pai_huggingface_bert_large_test.sh -v $(RocmVersion)"
|
||||
workingDirectory: $(Build.SourcesDirectory)
|
||||
displayName: 'Run Python Hugging-Face BERT-L test'
|
||||
condition: succeededOrFailed()
|
||||
|
|
@ -250,11 +250,12 @@ jobs:
|
|||
displayName: 'Run orttraining_ortmodule_tests.py'
|
||||
condition: succeededOrFailed()
|
||||
|
||||
- task: CmdLine@2
|
||||
|
||||
- task: Bash@3
|
||||
inputs:
|
||||
script: |-
|
||||
bash $(Build.SourcesDirectory)/tools/ci_build/github/pai/pai_clean_device.sh $(Agent.Name) $HIP_VISIBLE_DEVICES
|
||||
workingDirectory: $(Build.SourcesDirectory)
|
||||
targetType: filePath
|
||||
filePath: $(Build.SourcesDirectory)/tools/ci_build/github/pai/pai_clean_device.sh
|
||||
arguments: -n $(Agent.Name) -d $HIP_VISIBLE_DEVICES -r $DRIVER_RENDER
|
||||
displayName: 'Clean ROCm Environment'
|
||||
condition: always()
|
||||
|
||||
|
|
|
|||
|
|
@ -6,7 +6,8 @@ ARG MIGRAPHX_VERSION=rocm-5.5.0
|
|||
ENV DEBIAN_FRONTEND noninteractive
|
||||
ENV MIGRAPHX_DISABLE_FAST_GELU=1
|
||||
|
||||
RUN apt-get clean && apt-get update -y && apt-get upgrade -y && apt-get install -y locales unzip
|
||||
RUN apt-get update -y && apt-get upgrade -y && apt-get autoremove -y && \
|
||||
apt-get install -y locales unzip && apt-get clean -y
|
||||
RUN locale-gen en_US.UTF-8
|
||||
RUN update-locale LANG=en_US.UTF-8
|
||||
ENV LC_ALL C.UTF-8
|
||||
|
|
|
|||
|
|
@ -1,38 +1,47 @@
|
|||
#!/bin/bash
|
||||
set -ex
|
||||
|
||||
agentName=$1
|
||||
target_device=$2
|
||||
echo "agent name $agentName"
|
||||
echo "agent target device : $target_device"
|
||||
usage() { echo "Usage: $0 [-n <agent name>] [-d <target device>] [-r <driver render>]" 1>&2; exit 1; }
|
||||
|
||||
echo -e "\n ---- rocm-smi"
|
||||
while getopts "n:d:r:" parameter_Option
|
||||
do case "${parameter_Option}"
|
||||
in
|
||||
n) AGENT_NAME=${OPTARG};;
|
||||
d) TARGET_DEVICE=${OPTARG};;
|
||||
r) DRIVER_RENDER=${OPTARG};;
|
||||
*) usage ;;
|
||||
esac
|
||||
done
|
||||
|
||||
echo "Agent Name: $AGENT_NAME, Target Device: $TARGET_DEVICE, Driver Render: $DRIVER_RENDER"
|
||||
|
||||
echo -e "\n ---- Execute rocm-smi"
|
||||
rocm-smi
|
||||
|
||||
echo -e "\n ---- rocm-smi --showpids"
|
||||
echo -e "\n ---- Execute rocm-smi --showpids"
|
||||
rocm-smi --showpids
|
||||
|
||||
echo -e "\n ---- rocm-smi --showpidgpus"
|
||||
echo -e "\n ---- Execute rocm-smi --showpidgpus"
|
||||
rocm-smi --showpidgpus
|
||||
|
||||
echo -e "\n ---- rocm-smi --showpids detail"
|
||||
echo -e "\n ---- Execute rocm-smi --showpids detail"
|
||||
rocm-smi --showpids | awk '$1 ~/[0-9]+/{if((NR>6)) {print $1}}' | xargs -I {} ps {}
|
||||
|
||||
echo -e "\n ---- rocm-smi --showmeminfo"
|
||||
echo -e "\n ---- Execute rocm-smi --showmeminfo"
|
||||
rocm-smi --showmeminfo vram vis_vram gtt
|
||||
|
||||
echo -e "\n ---- Clean up the process that is using the target device"
|
||||
gpu_details=$(rocm-smi --showpidgpus)
|
||||
pid_lines=$(echo "$gpu_details" | grep -n "DRM device" | cut -d ":" -f 1)
|
||||
pid_lines_array=($pid_lines)
|
||||
echo -e "\n ---- Clean up processes that use the target device $TARGET_DEVICE"
|
||||
GPU_USED_BY_PIDS=$(rocm-smi --showpidgpus)
|
||||
PID_NUMBERS_LINES=$(echo "$GPU_USED_BY_PIDS" | grep -n "DRM device" | cut -d ":" -f 1)
|
||||
PID_NUMBERS_LINES_ARRAY=($PID_NUMBERS_LINES)
|
||||
|
||||
for ((i = 0; i < ${#pid_lines_array[@]}; i++)); do
|
||||
pid_line=${pid_lines_array[$i]}
|
||||
pid=$(echo "$gpu_details" | awk '{print $2}' | sed -n "${pid_line}p")
|
||||
gpu_line=$((pid_line + 1))
|
||||
pid_gpu=$(echo "$gpu_details" | sed -n "${gpu_line}p" | sed -e 's/^[ ]*//g' | sed -e 's/[ ]*$//g')
|
||||
if [ "$pid_gpu" == "$target_device" ]; then
|
||||
echo "kill pid: $pid, gpu: $pid_gpu"
|
||||
kill -9 $pid
|
||||
for ((i = 0; i < ${#PID_NUMBERS_LINES_ARRAY[@]}; i++)); do
|
||||
PID_NUMBER_LINE=${PID_NUMBERS_LINES_ARRAY[$i]}
|
||||
PID_NUMBER=$(echo "$GPU_USED_BY_PIDS" | awk '{print $2}' | sed -n "${PID_NUMBER_LINE}p")
|
||||
GPU_USED_BY_PID_LINE=$((PID_NUMBER_LINE + 1))
|
||||
GPU_USED_BY_PID=$(echo "$GPU_USED_BY_PIDS" | sed -n "${GPU_USED_BY_PID_LINE}p" | sed -e 's/^[ ]*//g' | sed -e 's/[ ]*$//g')
|
||||
if [ "$GPU_USED_BY_PID" == "$TARGET_DEVICE" ]; then
|
||||
echo "kill pid: $PID_NUMBER, using gpu: $GPU_USED_BY_PID"
|
||||
kill -9 "$PID_NUMBER"
|
||||
fi
|
||||
done
|
||||
|
|
|
|||
|
|
@ -2,15 +2,22 @@
|
|||
|
||||
set -ex
|
||||
|
||||
rocm_version=$1
|
||||
mi200_gpus=$(rocm-smi --showproductname | grep -c "MI250" | xargs)
|
||||
usage() { echo "Usage: $0 [-v <ROCm version>]" 1>&2; exit 1; }
|
||||
|
||||
echo "mi200_gpus: $mi200_gpus"
|
||||
while getopts "v:" parameter_Option
|
||||
do case "${parameter_Option}"
|
||||
in
|
||||
v) ROCM_VERSION=${OPTARG};;
|
||||
*) usage ;;
|
||||
esac
|
||||
done
|
||||
|
||||
if [ "$mi200_gpus" -gt "0" ]; then
|
||||
result_file=ci-mi200.huggingface.bert-large-rocm${rocm_version}.json
|
||||
MI200_DEVICE_NUMBERS=$(rocm-smi --showproductname | grep -c "MI250" | xargs)
|
||||
|
||||
if [ "$MI200_DEVICE_NUMBERS" -gt "0" ]; then
|
||||
RESULT_FILE=ci-mi200.huggingface.bert-large-rocm${ROCM_VERSION}.json
|
||||
else
|
||||
result_file=ci-mi100.huggingface.bert-large-rocm${rocm_version}.json
|
||||
RESULT_FILE=ci-mi100.huggingface.bert-large-rocm${ROCM_VERSION}.json
|
||||
fi
|
||||
|
||||
python \
|
||||
|
|
@ -33,4 +40,4 @@ cat ci-pipeline-actual.json
|
|||
|
||||
python /onnxruntime_src/orttraining/tools/ci_test/compare_huggingface.py \
|
||||
ci-pipeline-actual.json \
|
||||
/onnxruntime_src/orttraining/tools/ci_test/results/${result_file}
|
||||
/onnxruntime_src/orttraining/tools/ci_test/results/"$RESULT_FILE"
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
FROM rocm/pytorch:rocm5.5_ubuntu20.04_py3.8_pytorch_1.13.1
|
||||
FROM rocm/cupy:rocm5.5.0_ubuntu20.04_py3.8_pytorch2.0.0_cupy13.0.0
|
||||
|
||||
RUN apt-get update -y && apt-get upgrade -y
|
||||
RUN apt-get update -y && apt-get upgrade -y && apt-get autoremove -y && apt-get clean -y
|
||||
|
||||
WORKDIR /stage
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue