pytorch/.github/workflows/_rocm-test.yml
Huy Do 21dd311077 Add a mode to rerun all disabled tests (without running anything else) (#88646)
Rerun all disabled tests to gather their latest results so that we can close disabled-test tickets automatically. When running in this mode (`RERUN_DISABLED_TESTS=true`), only disabled tests are run while the rest are skipped: `<skipped message="Test is enabled but --rerun-disabled-tests verification mode is set, so only disabled tests are run" type="skip"/>`

The logic is roughly as follows: each test is run multiple times (n=50), and the outcome falls into one of the three cases below (see the sketch after the list).

* If the disabled test passes on some retries but not on all of them, it's still flaky, so do nothing. In the test report, we'll see the test pass with the following skipped message:
```
<testcase classname="TestMultiprocessing" file="test_multiprocessing.py" line="357" name="test_fs" time="0.000" timestamp="0001-01-01T00:00:00">
    <skipped message="{&quot;flaky&quot;: True, &quot;num_red&quot;: 4, &quot;num_green&quot;: 0, &quot;max_num_retries&quot;: 3, &quot;rerun_disabled_test&quot;: true}" type="skip"/>
</testcase>
```

* If the disabled test passes every single time, it is not flaky anymore, so mark it so that it can be closed later. We will see the test run and pass, i.e.
```
<testcase classname="TestCommonCUDA" name="test_out_warning_linalg_lu_factor_cuda" time="0.170" file="test_ops.py" />
```

* If the disabled test fails after all retries, that is also expected, so just report it without failing the job (we don't care about red signals here). We'll see the test skipped (without the `flaky` field), i.e.
```
<testcase classname="TestMultiprocessing" file="test_multiprocessing.py" line="357" name="test_fs" time="0.000" timestamp="0001-01-01T00:00:00">
    <skipped message="{&quot;num_red&quot;: 4, &quot;num_green&quot;: 0, &quot;max_num_retries&quot;: 3, &quot;rerun_disabled_test&quot;: true}" type="skip"/>
</testcase>
```
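
The bookkeeping behind these three cases can be sketched as follows. This is a simplified illustration, not the actual harness code: `NUM_RETRIES` and `skip_message` are hypothetical names, and the real logic lives in the PyTorch test harness, which emits the XML skip messages shown above.

```python
import json

NUM_RETRIES = 50  # each disabled test is rerun this many times in this mode

def skip_message(num_green: int, num_red: int):
    """Return the JSON skip message for one rerun disabled test, or None
    when the test should be reported as a plain pass (no longer flaky)."""
    if num_red == 0:
        # Passed every retry: report a normal <testcase/> so the
        # disabled-test ticket can be closed later.
        return None
    stats = {
        "num_red": num_red,
        "num_green": num_green,
        "max_num_retries": NUM_RETRIES,
        "rerun_disabled_test": True,
    }
    if num_green > 0:
        # Mixed results: still flaky, keep the ticket open.
        stats["flaky"] = True
    # num_green == 0: failed every retry; skipped without the "flaky"
    # field, and the job itself still succeeds (no red signal).
    return json.dumps(stats)

print(skip_message(num_green=0, num_red=NUM_RETRIES))
```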

This runs on the same schedule as `mem_leak_check` (daily). The change to update test stats and (potentially) the grouping on HUD will come in separate PRs.

### Testing

* pull https://github.com/pytorch/pytorch/actions/runs/3447434434
* trunk https://github.com/pytorch/pytorch/actions/runs/3447434928
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88646
Approved by: https://github.com/clee2000
2022-11-15 05:08:26 +00:00


# TODO: this looks sort of similar to _linux-test, but there are like a dozen
# places where you would have to insert an if statement. Probably it's better to
# just use a different workflow altogether
name: test
on:
  workflow_call:
    inputs:
      build-environment:
        required: true
        type: string
        description: Top-level label for what's being built/tested.
      test-matrix:
        required: true
        type: string
        description: JSON description of what test configs to run.
      docker-image:
        required: true
        type: string
        description: Docker image to run in.
      sync-tag:
        required: false
        type: string
        default: ""
        description: |
          If this is set, our linter will use this to make sure that every other
          job with the same `sync-tag` is identical.
    secrets:
      AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID:
        required: true
        description: access key id for test stats upload
      AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY:
        required: true
        description: secret access key for test stats upload
env:
  GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
jobs:
  test:
    # Don't run on forked repos.
    if: github.repository_owner == 'pytorch'
    timeout-minutes: 300
    strategy:
      matrix: ${{ fromJSON(inputs.test-matrix) }}
      fail-fast: false
    runs-on: ${{ matrix.runner }}
    steps:
      # [see note: pytorch repo ref]
      - name: Checkout PyTorch
        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
        with:
          no-sudo: true
      - name: Setup ROCm
        uses: ./.github/actions/setup-rocm
      - name: Pull docker image
        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
        with:
          docker-image: ${{ inputs.docker-image }}
      - name: Start monitoring script
        id: monitor-script
        shell: bash
        run: |
          python3 -m pip install psutil==5.9.1
          python3 -m pip install pynvml==11.4.1
          python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
          echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"
      - name: Download build artifacts
        uses: ./.github/actions/download-build-artifacts
        with:
          name: ${{ inputs.build-environment }}
      - name: Parse ref
        id: parse-ref
        run: .github/scripts/parse_ref.py
      - name: Test
        id: test
        env:
          BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          BRANCH: ${{ steps.parse-ref.outputs.branch }}
          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
          PYTORCH_RETRY_TEST_CASES: 1
          PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
          TEST_CONFIG: ${{ matrix.config }}
          SHARD_NUMBER: ${{ matrix.shard }}
          NUM_TEST_SHARDS: ${{ matrix.num_shards }}
          PR_BODY: ${{ github.event.pull_request.body }}
          SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
          DOCKER_IMAGE: ${{ inputs.docker-image }}
          XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
          PYTORCH_JIT_ENABLE_NVFUSER: 1
          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
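          # Added by #88646: when the test matrix sets rerun_disabled_tests,
          # the job reruns only disabled tests (so their tickets can be closed
          # automatically) and skips everything else.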
          PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
        timeout-minutes: 270
        run: |
          set -x
          if [[ $TEST_CONFIG == 'multigpu' ]]; then
            TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
          elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
            TEST_COMMAND=.jenkins/caffe2/test.sh
          else
            TEST_COMMAND=.jenkins/pytorch/test.sh
          fi
          COMMIT_MESSAGES=$(git cherry -v "origin/${GIT_DEFAULT_BRANCH:-master}")
          # sanitize the input commit message and PR body here:
          #
          # trim all new lines from commit messages + PR_BODY to avoid issues with batch environment
          # variable copying. see https://github.com/pytorch/pytorch/pull/80043#issuecomment-1167796028
          COMMIT_MESSAGES="${COMMIT_MESSAGES//[$'\n\r']}"
          PR_BODY="${PR_BODY//[$'\n\r']}"
          # then trim all special characters like single and double quotes to avoid unescaped inputs to
          # wreak havoc internally
          export COMMIT_MESSAGES="${COMMIT_MESSAGES//[\'\"]}"
          export PR_BODY="${PR_BODY//[\'\"]}"
          # detached container should get cleaned up by teardown_ec2_linux
          # TODO: Stop building test binaries as part of the build phase
          # Used for GPU_FLAG since that doesn't play nice
          # shellcheck disable=SC2086,SC2090
          container_name=$(docker run \
            ${GPU_FLAG:-} \
            -e BUILD_ENVIRONMENT \
            -e PR_NUMBER \
            -e GITHUB_ACTIONS \
            -e BRANCH \
            -e SHA1 \
            -e AWS_DEFAULT_REGION \
            -e IN_WHEEL_TEST \
            -e SHARD_NUMBER \
            -e TEST_CONFIG \
            -e NUM_TEST_SHARDS \
            -e PR_BODY \
            -e COMMIT_MESSAGES \
            -e PYTORCH_RETRY_TEST_CASES \
            -e PYTORCH_OVERRIDE_FLAKY_SIGNAL \
            -e MAX_JOBS="$(nproc --ignore=2)" \
            -e SCCACHE_BUCKET \
            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
            -e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
            -e PYTORCH_TEST_RERUN_DISABLED_TESTS \
            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
            --ulimit stack=10485760:83886080 \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
            --shm-size="8g" \
            --tty \
            --detach \
            --name="${container_name}" \
            --user jenkins \
            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
            -w /var/lib/jenkins/workspace \
            "${DOCKER_IMAGE}"
          )
          # save container name for later step
          echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV"
          # jenkins user does not have write permission to mounted workspace; work-around by copying within container to jenkins home
          docker exec -t "${container_name}" sh -c "cd .. && cp -R workspace pytorch && cd pytorch && pip install dist/*.whl && ${TEST_COMMAND}"
      - name: Save test results
        if: always()
        run: |
          # copy test results back to the mounted workspace, needed sudo, resulting permissions were correct
          docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "cd ../pytorch && sudo cp -R test/test-reports ../workspace/test"
      - name: Get workflow job id
        id: get-job-id
        uses: ./.github/actions/get-workflow-job-id
        if: always()
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
      - name: Stop monitoring script
        if: always() && steps.monitor-script.outputs.monitor-script-pid
        shell: bash
        continue-on-error: true
        env:
          MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }}
        run: |
          kill "$MONITOR_SCRIPT_PID"
      - name: Upload test artifacts
        uses: ./.github/actions/upload-test-artifacts
        if: always() && (steps.test.conclusion == 'success' || steps.test.conclusion == 'failure')
        with:
          use-gha: true
          file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
      - name: Upload test statistics
        if: always()
        env:
          AWS_DEFAULT_REGION: us-east-1
          GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
          BRANCH: ${{ steps.parse-ref.outputs.branch }}
          TEST_CONFIG: ${{ matrix.config }}
          SHARD_NUMBER: ${{ matrix.shard }}
          BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          PYTORCH_RETRY_TEST_CASES: 1
          PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
          TAG: ${{ steps.parse-ref.outputs.tag }}
          WORKFLOW_ID: ${{ github.run_id }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }}
          GHA_WORKFLOW_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
        shell: bash
        run: |
          set -x
          python3 -m pip install -r requirements.txt
          python3 -m pip install boto3==1.19.12
          python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
      - name: Teardown ROCm
        if: always()
        shell: bash
        run: |
          # Only stop the docker container we started since there might be multiple runners on this host.
          docker stop "${{ env.CONTAINER_NAME }}" || true
          # Prune all of the docker containers.
          # Might fail if a prune is already in progress by another runner.
          docker container prune -f || true
          # Prune everything docker if there are more than 10 images (~200GB).
          # This is easier than using a time filter, e.g., "until=24h".
          # Might fail if a prune is already in progress by another runner.
          image_count=$(docker images | wc -l)
          if [[ ${image_count} -gt 10 ]]; then
            echo "Purging all docker caches"
            docker system prune -af || true
          else
            echo "Will not purge docker, only ${image_count} images found"
          fi
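
Since the workflow above is exposed via `workflow_call`, a caller workflow wires it up roughly as follows. This is a minimal sketch with illustrative values: the job name, `build-environment`, and `test-matrix` entries are hypothetical, and in practice the real callers compute `docker-image` and the matrix from a preceding build job.

```yaml
jobs:
  rocm-test:  # hypothetical caller job name
    needs: build  # assumes a prior job that built PyTorch and its Docker image
    uses: ./.github/workflows/_rocm-test.yml
    with:
      build-environment: linux-focal-rocm-py3.8  # illustrative label
      docker-image: ${{ needs.build.outputs.docker-image }}  # illustrative
      test-matrix: |
        { include: [
          { config: "default", shard: 1, num_shards: 1, runner: "linux.rocm.gpu", rerun_disabled_tests: true },
        ]}
    secrets:
      AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }}
      AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }}
```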