From 8ede2f139e72bcdd1092c06314e1eb2b9f37775e Mon Sep 17 00:00:00 2001 From: PeixuanZuo <94887879+PeixuanZuo@users.noreply.github.com> Date: Mon, 24 Jul 2023 13:57:48 +0800 Subject: [PATCH] [ROCm] Optimize ROCm CI pipeline 2 (#16691) - Set `KERNEL_EXPLORER_TEST_USE_CUPY=1` to replace numpy with cupy on kernel explorer test. KERNEL_EXPLORER_TEST_USE_CUPY=0 The CPU utilization is shown as below: ![image](https://github.com/microsoft/onnxruntime/assets/94887879/91724b78-0b4e-4cbd-ad88-83cad9976472) KERNEL_EXPLORER_TEST_USE_CUPY=1 The CPU utilization is shown as below: ![image](https://github.com/microsoft/onnxruntime/assets/94887879/58239911-667c-4d5f-bb78-deca60d0266f) - Use `Bash@3`. - Update shell script. --- .../orttraining-pai-ci-pipeline.yml | 21 ++++---- .../migraphx-ci-pipeline-env.Dockerfile | 3 +- tools/ci_build/github/pai/pai_clean_device.sh | 51 +++++++++++-------- .../pai/pai_huggingface_bert_large_test.sh | 21 +++++--- .../pai/rocm-ci-pipeline-env.Dockerfile | 4 +- 5 files changed, 59 insertions(+), 41 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml index 4e4073ae84..1295f84142 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml @@ -133,12 +133,11 @@ jobs: DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )" Repository: onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion)-test - - task: CmdLine@2 + - task: Bash@3 inputs: - script: |- - echo "Select agent: $(Agent.Name), GPU: $HIP_VISIBLE_DEVICES, render: $DRIVER_RENDER" - bash $(Build.SourcesDirectory)/tools/ci_build/github/pai/pai_clean_device.sh $(Agent.Name) $HIP_VISIBLE_DEVICES - workingDirectory: $(Build.SourcesDirectory) + targetType: filePath + filePath: $(Build.SourcesDirectory)/tools/ci_build/github/pai/pai_clean_device.sh + arguments: -n $(Agent.Name) -d $HIP_VISIBLE_DEVICES -r $DRIVER_RENDER displayName: 'Check ROCm Environment' - task: CmdLine@2 @@ -182,6 +181,7 @@ jobs: set -ex; \ export KERNEL_EXPLORER_BUILD_DIR=/build/$(BuildConfig); \ export KERNEL_EXPLORER_BATCHED_GEMM_MAX_BATCH_SIZE=8; \ + export KERNEL_EXPLORER_TEST_USE_CUPY=1; \ pytest /onnxruntime_src/onnxruntime/python/tools/kernel_explorer/ -n 8 --reruns 1 --durations=100" workingDirectory: $(Build.SourcesDirectory) displayName: 'Run kernel explorer tests' @@ -206,7 +206,7 @@ jobs: set -ex; \ export PYTHONPATH=/build/$(BuildConfig); \ python -m onnxruntime.training.ortmodule.torch_cpp_extensions.install; \ - bash /onnxruntime_src/tools/ci_build/github/pai/pai_huggingface_bert_large_test.sh $(RocmVersion)" + bash /onnxruntime_src/tools/ci_build/github/pai/pai_huggingface_bert_large_test.sh -v $(RocmVersion)" workingDirectory: $(Build.SourcesDirectory) displayName: 'Run Python Hugging-Face BERT-L test' condition: succeededOrFailed() @@ -250,11 +250,12 @@ jobs: displayName: 'Run orttraining_ortmodule_tests.py' condition: succeededOrFailed() - - task: CmdLine@2 + + - task: Bash@3 inputs: - script: |- - bash $(Build.SourcesDirectory)/tools/ci_build/github/pai/pai_clean_device.sh $(Agent.Name) $HIP_VISIBLE_DEVICES - workingDirectory: $(Build.SourcesDirectory) + targetType: filePath + filePath: $(Build.SourcesDirectory)/tools/ci_build/github/pai/pai_clean_device.sh + arguments: -n $(Agent.Name) -d $HIP_VISIBLE_DEVICES -r $DRIVER_RENDER displayName: 'Clean ROCm Environment' condition: always() diff --git a/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile b/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile index 90e9731a35..d3bca26875 100644 --- a/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile +++ b/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile @@ -6,7 +6,8 @@ ARG MIGRAPHX_VERSION=rocm-5.5.0 ENV DEBIAN_FRONTEND noninteractive ENV MIGRAPHX_DISABLE_FAST_GELU=1 -RUN apt-get clean && apt-get update -y && apt-get upgrade -y && apt-get install -y locales unzip +RUN apt-get update -y && apt-get upgrade -y && apt-get autoremove -y && \ + apt-get install -y locales unzip && apt-get clean -y RUN locale-gen en_US.UTF-8 RUN update-locale LANG=en_US.UTF-8 ENV LC_ALL C.UTF-8 diff --git a/tools/ci_build/github/pai/pai_clean_device.sh b/tools/ci_build/github/pai/pai_clean_device.sh index 4ebbcf4920..98b680d4f4 100755 --- a/tools/ci_build/github/pai/pai_clean_device.sh +++ b/tools/ci_build/github/pai/pai_clean_device.sh @@ -1,38 +1,47 @@ #!/bin/bash set -ex -agentName=$1 -target_device=$2 -echo "agent name $agentName" -echo "agent target device : $target_device" +usage() { echo "Usage: $0 [-n ] [-d ] [-r ]" 1>&2; exit 1; } -echo -e "\n ---- rocm-smi" +while getopts "n:d:r:" parameter_Option +do case "${parameter_Option}" +in +n) AGENT_NAME=${OPTARG};; +d) TARGET_DEVICE=${OPTARG};; +r) DRIVER_RENDER=${OPTARG};; +*) usage ;; +esac +done + +echo "Agent Name: $AGENT_NAME, Target Device: $TARGET_DEVICE, Driver Render: $DRIVER_RENDER" + +echo -e "\n ---- Execute rocm-smi" rocm-smi -echo -e "\n ---- rocm-smi --showpids" +echo -e "\n ---- Execute rocm-smi --showpids" rocm-smi --showpids -echo -e "\n ---- rocm-smi --showpidgpus" +echo -e "\n ---- Execute rocm-smi --showpidgpus" rocm-smi --showpidgpus -echo -e "\n ---- rocm-smi --showpids detail" +echo -e "\n ---- Execute rocm-smi --showpids detail" rocm-smi --showpids | awk '$1 ~/[0-9]+/{if((NR>6)) {print $1}}' | xargs -I {} ps {} -echo -e "\n ---- rocm-smi --showmeminfo" +echo -e "\n ---- Execute rocm-smi --showmeminfo" rocm-smi --showmeminfo vram vis_vram gtt -echo -e "\n ---- Clean up the process that is using the target device" -gpu_details=$(rocm-smi --showpidgpus) -pid_lines=$(echo "$gpu_details" | grep -n "DRM device" | cut -d ":" -f 1) -pid_lines_array=($pid_lines) +echo -e "\n ---- Clean up processes that use the target device $TARGET_DEVICE" +GPU_USED_BY_PIDS=$(rocm-smi --showpidgpus) +PID_NUMBERS_LINES=$(echo "$GPU_USED_BY_PIDS" | grep -n "DRM device" | cut -d ":" -f 1) +PID_NUMBERS_LINES_ARRAY=($PID_NUMBERS_LINES) -for ((i = 0; i < ${#pid_lines_array[@]}; i++)); do - pid_line=${pid_lines_array[$i]} - pid=$(echo "$gpu_details" | awk '{print $2}' | sed -n "${pid_line}p") - gpu_line=$((pid_line + 1)) - pid_gpu=$(echo "$gpu_details" | sed -n "${gpu_line}p" | sed -e 's/^[ ]*//g' | sed -e 's/[ ]*$//g') - if [ "$pid_gpu" == "$target_device" ]; then - echo "kill pid: $pid, gpu: $pid_gpu" - kill -9 $pid +for ((i = 0; i < ${#PID_NUMBERS_LINES_ARRAY[@]}; i++)); do + PID_NUMBER_LINE=${PID_NUMBERS_LINES_ARRAY[$i]} + PID_NUMBER=$(echo "$GPU_USED_BY_PIDS" | awk '{print $2}' | sed -n "${PID_NUMBER_LINE}p") + GPU_USED_BY_PID_LINE=$((PID_NUMBER_LINE + 1)) + GPU_USED_BY_PID=$(echo "$GPU_USED_BY_PIDS" | sed -n "${GPU_USED_BY_PID_LINE}p" | sed -e 's/^[ ]*//g' | sed -e 's/[ ]*$//g') + if [ "$GPU_USED_BY_PID" == "$TARGET_DEVICE" ]; then + echo "kill pid: $PID_NUMBER, using gpu: $GPU_USED_BY_PID" + kill -9 "$PID_NUMBER" fi done diff --git a/tools/ci_build/github/pai/pai_huggingface_bert_large_test.sh b/tools/ci_build/github/pai/pai_huggingface_bert_large_test.sh index 3b8e828fe9..fb4dbeb2e7 100644 --- a/tools/ci_build/github/pai/pai_huggingface_bert_large_test.sh +++ b/tools/ci_build/github/pai/pai_huggingface_bert_large_test.sh @@ -2,15 +2,22 @@ set -ex -rocm_version=$1 -mi200_gpus=$(rocm-smi --showproductname | grep -c "MI250" | xargs) +usage() { echo "Usage: $0 [-v ]" 1>&2; exit 1; } -echo "mi200_gpus: $mi200_gpus" +while getopts "v:" parameter_Option +do case "${parameter_Option}" +in +v) ROCM_VERSION=${OPTARG};; +*) usage ;; +esac +done -if [ "$mi200_gpus" -gt "0" ]; then - result_file=ci-mi200.huggingface.bert-large-rocm${rocm_version}.json +MI200_DEVICE_NUMBERS=$(rocm-smi --showproductname | grep -c "MI250" | xargs) + +if [ "$MI200_DEVICE_NUMBERS" -gt "0" ]; then + RESULT_FILE=ci-mi200.huggingface.bert-large-rocm${ROCM_VERSION}.json else - result_file=ci-mi100.huggingface.bert-large-rocm${rocm_version}.json + RESULT_FILE=ci-mi100.huggingface.bert-large-rocm${ROCM_VERSION}.json fi python \ @@ -33,4 +40,4 @@ cat ci-pipeline-actual.json python /onnxruntime_src/orttraining/tools/ci_test/compare_huggingface.py \ ci-pipeline-actual.json \ - /onnxruntime_src/orttraining/tools/ci_test/results/${result_file} + /onnxruntime_src/orttraining/tools/ci_test/results/"$RESULT_FILE" diff --git a/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile b/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile index 540aeaf351..7540856913 100644 --- a/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile +++ b/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile @@ -1,6 +1,6 @@ -FROM rocm/pytorch:rocm5.5_ubuntu20.04_py3.8_pytorch_1.13.1 +FROM rocm/cupy:rocm5.5.0_ubuntu20.04_py3.8_pytorch2.0.0_cupy13.0.0 -RUN apt-get update -y && apt-get upgrade -y +RUN apt-get update -y && apt-get upgrade -y && apt-get autoremove -y && apt-get clean -y WORKDIR /stage