name: linux-test

on:
  workflow_call:
    inputs:
      build-environment:
        required: true
        type: string
        description: Top-level label for what's being built/tested.
      test-matrix:
        required: true
        type: string
        description: JSON description of what test configs to run.
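        # A sketch with hypothetical values: the JSON is expected to provide the
        # fields consumed by the test job below, e.g.
        #   { "include": [{ "config": "default", "shard": 1, "num_shards": 2, "runner": "linux.2xlarge" }] }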
      docker-image:
        required: true
        type: string
        description: Docker image to run in.
      sync-tag:
        required: false
        type: string
        default: ""
        description: |
          If this is set, our linter will use this to make sure that every other
          job with the same `sync-tag` is identical.
      timeout-minutes:
        required: false
        type: number
        default: 240
        description: |
          Set the maximum time (in minutes) that the workflow should take to finish
      use-gha:
        required: false
        type: string
        default: ""
        description: If set to any value, upload to GHA. Otherwise upload to S3.

env:
  GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}

jobs:
  # This needs to be run right before the test starts so that it can gather the
  # latest labels from the PR
  filter:
    runs-on: [self-hosted, linux.large]
    outputs:
      test-matrix: ${{ steps.filter.outputs.test-matrix }}
      is-test-matrix-empty: ${{ steps.filter.outputs.is-test-matrix-empty }}
      keep-going: ${{ steps.filter.outputs.keep-going }}
    steps:
      - name: Checkout PyTorch
        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
        with:
          fetch-depth: 1
          submodules: false

      - name: Select all requested test configurations
        id: filter
        uses: ./.github/actions/filter-test-configs
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          test-matrix: ${{ inputs.test-matrix }}

  test:
    needs: filter
    # Don't run on forked repos or empty test matrix
    if: github.repository_owner == 'pytorch' && needs.filter.outputs.is-test-matrix-empty == 'False'
    strategy:
      matrix: ${{ fromJSON(needs.filter.outputs.test-matrix) }}
      fail-fast: false
    runs-on: ${{ matrix.runner }}
    timeout-minutes: ${{ inputs.timeout-minutes }}
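    # Each entry in the filtered matrix becomes one job instance, running its
    # shard of the tests on the runner type that entry names.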
    steps:
      - name: Setup SSH (Click me for login details)
        uses: pytorch/test-infra/.github/actions/setup-ssh@main
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
          instructions: |
            All testing is done inside the container. To start an interactive session, run:
              docker exec -it $(docker container ps --format '{{.ID}}') bash

      - name: Checkout PyTorch
        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master

      - name: Setup Linux
        uses: ./.github/actions/setup-linux

      - name: Pull docker image
        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
        with:
          docker-image: ${{ inputs.docker-image }}

      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
        id: install-nvidia-driver
        uses: pytorch/test-infra/.github/actions/setup-nvidia@main
        if: contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu')
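
      # For context: nvidia-smi -pm 1 enables persistence mode, and -ac takes a
      # "memory,graphics" MHz pair, pinning the application clocks (presumably
      # so GPU test timings are more stable across runs).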
      - name: Lock NVIDIA A100 40GB Frequency
        run: |
          sudo nvidia-smi -pm 1
          sudo nvidia-smi -ac 1215,1410
          nvidia-smi
        if: contains(matrix.runner, 'a100')
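
      # The monitor is backgrounded; ${!} is the PID of that background job,
      # saved as a step output so the later "Stop monitoring script" step can
      # kill it.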
      - name: Start monitoring script
        id: monitor-script
        shell: bash
        continue-on-error: true
        run: |
          python3 -m pip install psutil==5.9.1 nvidia-ml-py==11.525.84
          python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
          echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"

      - name: Download build artifacts
        uses: ./.github/actions/download-build-artifacts
        with:
          name: ${{ inputs.build-environment }}

      - name: Parse ref
        id: parse-ref
        run: .github/scripts/parse_ref.py

      - name: Set Test step time
        id: test-timeout
        shell: bash
        env:
          JOB_TIMEOUT: ${{ inputs.timeout-minutes }}
        run: |
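          # Give the test script 30 minutes less than the job itself so the
          # post-test steps (artifact upload, teardown) presumably still fit
          # within the job timeout.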
          echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}"

      - name: Test
        id: test
        timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
        env:
          BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          BRANCH: ${{ steps.parse-ref.outputs.branch }}
          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
          BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }}
          PYTORCH_RETRY_TEST_CASES: 1
          PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
          TEST_CONFIG: ${{ matrix.config }}
          SHARD_NUMBER: ${{ matrix.shard }}
          NUM_TEST_SHARDS: ${{ matrix.num_shards }}
          PR_BODY: ${{ github.event.pull_request.body }}
          CONTINUE_THROUGH_ERROR: ${{ needs.filter.outputs.keep-going }}
          SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
          SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
          SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
          DOCKER_IMAGE: ${{ inputs.docker-image }}
          XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
          XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
          PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
        run: |
          set -x

          if [[ $TEST_CONFIG == 'multigpu' ]]; then
            TEST_COMMAND=.ci/pytorch/multigpu-test.sh
          elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
            TEST_COMMAND=.ci/onnx/test.sh
          else
            TEST_COMMAND=.ci/pytorch/test.sh
          fi

          COMMIT_MESSAGES=$(git cherry -v "origin/${GIT_DEFAULT_BRANCH:-master}")

          # sanitize the input commit message and PR body here:
          #
          # trim all newlines from commit messages + PR_BODY to avoid issues with bash
          # environment variable copying. see https://github.com/pytorch/pytorch/pull/80043#issuecomment-1167796028
          COMMIT_MESSAGES="${COMMIT_MESSAGES//[$'\n\r']}"
          PR_BODY="${PR_BODY//[$'\n\r']}"

          # then trim special characters like single and double quotes to avoid unescaped
          # inputs wreaking havoc internally
          export COMMIT_MESSAGES="${COMMIT_MESSAGES//[\'\"]}"
          export PR_BODY="${PR_BODY//[\'\"]}"
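          # (illustration) ${VAR//[$'\n\r']} is bash pattern substitution with an
          # empty replacement: every newline/carriage return (and, above, every
          # quote) is simply deleted, e.g. $'a\nb' becomes "ab".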

          # detached container should get cleaned up by teardown_ec2_linux
          # TODO: Stop building test binaries as part of the build phase
          # Used for GPU_FLAG since that doesn't play nice
          # shellcheck disable=SC2086,SC2090
          container_name=$(docker run \
            ${GPU_FLAG:-} \
            -e BUILD_ENVIRONMENT \
            -e PR_NUMBER \
            -e GITHUB_ACTIONS \
            -e BASE_SHA \
            -e BRANCH \
            -e SHA1 \
            -e AWS_DEFAULT_REGION \
            -e IN_WHEEL_TEST \
            -e SHARD_NUMBER \
            -e TEST_CONFIG \
            -e NUM_TEST_SHARDS \
            -e PR_BODY \
            -e COMMIT_MESSAGES \
            -e CONTINUE_THROUGH_ERROR \
            -e PYTORCH_RETRY_TEST_CASES \
            -e PYTORCH_OVERRIDE_FLAKY_SIGNAL \
            -e PR_LABELS \
            -e MAX_JOBS="$(nproc --ignore=2)" \
            -e SCCACHE_BUCKET \
            -e SCCACHE_S3_KEY_PREFIX \
            -e XLA_CUDA \
            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
            -e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
            -e PYTORCH_TEST_RERUN_DISABLED_TESTS \
            -e SKIP_SCCACHE_INITIALIZATION=1 \
            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
            --ulimit stack=10485760:83886080 \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
            --ipc=host \
            --shm-size="${SHM_SIZE}" \
            --tty \
            --detach \
            --name="${container_name}" \
            --user jenkins \
            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
            -w /var/lib/jenkins/workspace \
            "${DOCKER_IMAGE}"
          )
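          # The container ID printed by docker run is captured in container_name
          # and exported via GITHUB_ENV so later steps (e.g. coredump collection)
          # can exec into the same container.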
          echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}"
          docker exec -t "${container_name}" sh -c "pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}"

      - name: Print remaining test logs
        shell: bash
        if: always()
        run: |
          cat test/**/*.log || true

      - name: Get workflow job id
        id: get-job-id
        uses: ./.github/actions/get-workflow-job-id
        if: always()
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}

      - name: Stop monitoring script
        if: always() && steps.monitor-script.outputs.monitor-script-pid
        shell: bash
        continue-on-error: true
        env:
          MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }}
        run: |
          kill "$MONITOR_SCRIPT_PID"

      - name: Upload test artifacts
        uses: ./.github/actions/upload-test-artifacts
        if: always() && steps.test.conclusion && steps.test.conclusion != 'skipped'
        with:
          file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
          use-gha: ${{ inputs.use-gha }}

      - name: Collect backtraces from coredumps (if any)
        if: always()
        run: |
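          # For each core file found in the workspace, run gdb inside the
          # container in batch fashion: print a backtrace (bt), then quit (q).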
          # shellcheck disable=SC2156
          find . -iname "core.[1-9]*" -exec docker exec "${DOCKER_CONTAINER_ID}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \;

      - name: Store Core dumps on S3
        uses: seemethere/upload-artifact-s3@v5
        if: failure()
        with:
          name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}
          retention-days: 14
          if-no-files-found: ignore
          path: ./**/core.[1-9]*

      - name: Teardown Linux
        uses: pytorch/test-infra/.github/actions/teardown-linux@main
        if: always()

      # NB: We are currently having an intermittent GPU-related issue on G5 runners with
      # A10G GPU. Once this happens, trying to reset the GPU as done in setup-nvidia does
      # not seem to help. Here are some symptoms:
      # * Calling nvidia-smi times out after 60 seconds
      # * nvidia-smi fails with an "unable to determine the device handle for GPU:
      #   unknown error" message
      # * Tests fail with a missing CUDA GPU error when initializing CUDA in PyTorch
      # * Running docker with --gpus all fails with an error response from the daemon
      #
      # As both the root cause and recovery path are unclear, let's take the runner out of
      # service so that it doesn't get any more jobs
      - name: Check NVIDIA driver installation step
        if:
          failure() &&
          ((steps.install-nvidia-driver.conclusion && steps.install-nvidia-driver.conclusion == 'failure') || (contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu')))
        shell: bash
        env:
          RUNNER_WORKSPACE: ${{ runner.workspace }}
        run: |
          set +e

          nvidia-smi
          NVIDIA_SMI_STATUS=$?

          # These are acceptable return codes from nvidia-smi, as copied from the setup-nvidia GitHub action
          if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
            echo "NVIDIA driver installation has failed, shutting down the runner..."
            .github/scripts/stop_runner_service.sh
          fi

      - name: Check other unrecoverable error (Run this last)
        if: failure() && contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu')
        shell: bash
        env:
          RUNNER_WORKSPACE: ${{ runner.workspace }}
        run: |
          set +e

          UNRECOVERABLE_ERRORS=(
            "docker: Error response from daemon"
          )
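
          # Scan the runner's diagnostic logs (the _diag/pages directory under
          # the runner's root) for each known-fatal message; grep returns 0
          # only when a match is found.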
          for ERROR in "${UNRECOVERABLE_ERRORS[@]}"
          do
            grep -Rli "${ERROR}" "${RUNNER_WORKSPACE}/../../_diag/pages"
            RC=$?

            # If GPU crashes, stop the runner to prevent it from receiving new jobs
            if [[ "${RC}" == "0" ]]; then
              echo "The runner has encountered an unrecoverable error (${ERROR}), shutting it down..."
              .github/scripts/stop_runner_service.sh
            fi
          done