name: linux-test

on:
  workflow_call:
    inputs:
      build-environment:
        required: true
        type: string
        description: Top-level label for what's being built/tested.
      test-matrix:
        required: true
        type: string
        description: JSON description of what test configs to run.
      docker-image:
        required: true
        type: string
        description: Docker image to run in.
      sync-tag:
        required: false
        type: string
        default: ""
        description: |
          If this is set, our linter will use this to make sure that every
          other job with the same `sync-tag` is identical.
      timeout-minutes:
        required: false
        type: number
        default: 240
        description: |
          Maximum time (in minutes) the workflow is allowed to take before
          it is cancelled.
      use-gha:
        required: false
        type: string
        default: ""
        description: If set to any value, upload to GHA. Otherwise upload to S3.
      dashboard-tag:
        required: false
        type: string
        default: ""
      s3-bucket:
        description: S3 bucket to download artifacts from
        required: false
        type: string
        default: "gha-artifacts"
      aws-role-to-assume:
        description: Role to assume for downloading artifacts
        required: false
        type: string
        default: ""
      disable-monitor:
        description: |
          [Experimental] Disable utilization monitoring for tests. Currently,
          we disable the monitor job by default and only enable it for
          specific tests, while we investigate the behaviour of the monitor
          script with different tests.
        required: false
        type: boolean
        default: false
    secrets:
      HUGGING_FACE_HUB_TOKEN:
        required: false
        description: |
          HF auth token to avoid rate limits when downloading models or
          datasets from the hub
      SCRIBE_GRAPHQL_ACCESS_TOKEN:
        required: false
        description: |
          FB app token to write to the scribe endpoint

env:
  GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}

jobs:
  test:
    # Don't run on forked repos or an empty test matrix
    if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]'
    strategy:
      matrix: ${{ fromJSON(inputs.test-matrix) }}
      fail-fast: false
    environment: ${{ github.ref == 'refs/heads/main' && 'scribe-protected' || startsWith(github.ref, 'refs/heads/release/') && 'scribe-protected' || contains(github.event.pull_request.labels.*.name, 'ci-scribe') && 'scribe-pr' || '' }}
    runs-on: ${{ matrix.runner }}
    timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }}
    steps:
      - name: Setup SSH (Click me for login details)
        uses: pytorch/test-infra/.github/actions/setup-ssh@main
        if: ${{ !contains(matrix.runner, 'gcp.a100') && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
          instructions: |
            All testing is done inside the container. To start an interactive session, run:
              docker exec -it $(docker container ps --format '{{.ID}}') bash

      - name: Checkout PyTorch
        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
        with:
          no-sudo: true

      - name: Setup Linux
        uses: ./.github/actions/setup-linux
        if: inputs.build-environment != 'linux-s390x-binary-manywheel'

      - name: Configure AWS credentials
        if: ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
        uses: aws-actions/configure-aws-credentials@v3
        with:
          role-to-assume: ${{ inputs.aws-role-to-assume }}
          role-session-name: gha-linux-test
          aws-region: us-east-1

      - name: Calculate docker image
        id: calculate-docker-image
        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
        if: inputs.build-environment != 'linux-s390x-binary-manywheel'
        with:
          docker-image-name: ${{ inputs.docker-image }}

      - name: Use the following to pull a public copy of the image
        id: print-ghcr-mirror
        if: inputs.build-environment != 'linux-s390x-binary-manywheel'
        env:
          ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
        shell: bash
        run: |
          tag=${ECR_DOCKER_IMAGE##*/}
          echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"
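      # For example (hypothetical image name), given
      #   ECR_DOCKER_IMAGE=308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-jammy-py3.9-gcc11:abc123
      # ${ECR_DOCKER_IMAGE##*/} strips everything up to the last '/', and
      # ${tag/:/-} turns the ':' into '-', so the step above prints:
      #   docker pull ghcr.io/pytorch/ci-image:pytorch-linux-jammy-py3.9-gcc11-abc123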
      - name: Pull docker image
        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
        if: inputs.build-environment != 'linux-s390x-binary-manywheel'
        with:
          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

      - name: Check if in a container runner
        shell: bash
        id: check_container_runner
        run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT"

      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
        id: install-nvidia-driver
        uses: pytorch/test-infra/.github/actions/setup-nvidia@main
        if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}

      - name: Setup GPU_FLAG for docker run
        id: setup-gpu-flag
        run: echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
        if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' }}

      - name: Setup SCCACHE_SERVER_PORT environment for docker run when on container
        id: setup-sccache-port-flag
        run: echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}"
        if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' }}

      - name: Lock NVIDIA A100 40GB Frequency
        run: |
          sudo nvidia-smi -pm 1
          sudo nvidia-smi -ac 1215,1410
          nvidia-smi
        if: ${{ contains(matrix.runner, 'a100') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}

      - name: Get workflow job id
        id: get-job-id
        uses: ./.github/actions/get-workflow-job-id
        if: always()
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}

      - name: Start monitoring script
        id: monitor-script
        if: ${{ !inputs.disable-monitor }}
        shell: bash
        continue-on-error: true
        env:
          JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
          JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
          WORKFLOW_NAME: ${{ github.workflow }}
          WORKFLOW_RUN_ID: ${{ github.run_id }}
        run: |
          python3 -m pip install psutil==5.9.1 nvidia-ml-py==11.525.84 dataclasses_json==0.6.7
          python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
          echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"

      - name: Download build artifacts
        uses: ./.github/actions/download-build-artifacts
        with:
          name: ${{ inputs.build-environment }}
          s3-bucket: ${{ inputs.s3-bucket }}
          use-gha: ${{ inputs.use-gha }}

      - name: Download TD artifacts
        continue-on-error: true
        uses: ./.github/actions/download-td-artifacts

      - name: Parse ref
        id: parse-ref
        run: .github/scripts/parse_ref.py

      - name: Check for keep-going label and re-enabled test issues
        # This uses the filter-test-configs action because it conveniently
        # checks for labels and re-enabled test issues. It does not actually
        # do any filtering. All filtering is done in the build step.
        id: keep-going
        uses: ./.github/actions/filter-test-configs
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          test-matrix: ${{ inputs.test-matrix }}
          job-name: ${{ steps.get-job-id.outputs.job-name }}

      - name: Set Test step time
        id: test-timeout
        shell: bash
        env:
          JOB_TIMEOUT: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }}
        run: |
          echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}"
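      # The Test step below gets 30 minutes less than the job timeout so that
      # the remaining steps (log collection, artifact upload, teardown) still
      # have time to run. For example, with the default timeout-minutes of 240
      # the Test step is capped at 210 minutes, and a mem_leak_check job (600)
      # at 570.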
      - name: Test
        id: test
        timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
        env:
          BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          GITHUB_REPOSITORY: ${{ github.repository }}
          GITHUB_WORKFLOW: ${{ github.workflow }}
          GITHUB_JOB: ${{ github.job }}
          GITHUB_RUN_ID: ${{ github.run_id }}
          GITHUB_RUN_NUMBER: ${{ github.run_number }}
          GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
          JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
          JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
          BRANCH: ${{ steps.parse-ref.outputs.branch }}
          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
          BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }}
          TEST_CONFIG: ${{ matrix.config }}
          SHARD_NUMBER: ${{ matrix.shard }}
          NUM_TEST_SHARDS: ${{ matrix.num_shards }}
          REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
          CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
          VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
          TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }}
          NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }}
          NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
          TD_DISTRIBUTED: ${{ steps.keep-going.outputs.ci-td-distributed }}
          # Do not set SCCACHE_S3_KEY_PREFIX to share the cache between all build jobs
          SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
          SCCACHE_REGION: us-east-1
          SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
          DOCKER_IMAGE: ${{ inputs.docker-image }}
          XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
          XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
          PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
          DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
          SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
          IS_A100_RUNNER: ${{ contains(matrix.runner, 'a100') && '1' || '0' }}
          ARTIFACTS_FILE_SUFFIX: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
        run: |
          set -x

          if [[ $TEST_CONFIG == 'multigpu' ]]; then
            TEST_COMMAND=.ci/pytorch/multigpu-test.sh
          elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
            TEST_COMMAND=.ci/onnx/test.sh
          else
            TEST_COMMAND=.ci/pytorch/test.sh
          fi

          # Leaving 1GB for the runner and other things
          TOTAL_AVAILABLE_MEMORY_IN_GB=$(awk '/MemTotal/ { printf "%.3f \n", $2/1024/1024 - 1 }' /proc/meminfo)
          # https://docs.docker.com/engine/containers/resource_constraints/#--memory-swap-details, the 3GB swap
          # comes from https://github.com/pytorch/test-infra/pull/6058
          TOTAL_MEMORY_WITH_SWAP=$(("${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}" + 3))
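          # Worked example (illustrative numbers): on a host whose /proc/meminfo
          # reports MemTotal of 64000000 kB, the awk above yields
          # 64000000/1024/1024 - 1 = 60.035. The ${VAR%.*} expansion truncates
          # that to 60, so the container below runs with --memory=60g and
          # --memory-swap=63g, i.e. 60GB of RAM plus the 3GB of swap.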
          if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then
            SHM_OPTS=
            JENKINS_USER=
            # ensure that the docker container cleanly exits in 12 hours
            # if for some reason the cleanup action doesn't stop the container
            # when the job is cancelled
            DOCKER_SHELL_CMD="sleep 12h"

            # since some steps are skipped on s390x, if they are necessary, run them here
            env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
            env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"
          else
            SHM_OPTS="--shm-size=${SHM_SIZE}"
            JENKINS_USER="--user jenkins"
            DOCKER_SHELL_CMD=
          fi

          # detached container should get cleaned up by teardown_ec2_linux
          # TODO: Stop building test binaries as part of the build phase
          # The shellcheck exemptions below are for the unquoted GPU_FLAG, SHM_OPTS,
          # JENKINS_USER and DOCKER_SHELL_CMD expansions, which don't play nice with quoting
          # shellcheck disable=SC2086,SC2090
          container_name=$(docker run \
            ${GPU_FLAG:-} \
            ${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \
            -e BUILD_ENVIRONMENT \
            -e PR_NUMBER \
            -e GITHUB_ACTIONS \
            -e GITHUB_REPOSITORY \
            -e GITHUB_WORKFLOW \
            -e GITHUB_JOB \
            -e GITHUB_RUN_ID \
            -e GITHUB_RUN_NUMBER \
            -e GITHUB_RUN_ATTEMPT \
            -e JOB_ID \
            -e JOB_NAME \
            -e BASE_SHA \
            -e BRANCH \
            -e SHA1 \
            -e AWS_DEFAULT_REGION \
            -e IN_WHEEL_TEST \
            -e SHARD_NUMBER \
            -e TEST_CONFIG \
            -e NUM_TEST_SHARDS \
            -e REENABLED_ISSUES \
            -e CONTINUE_THROUGH_ERROR \
            -e VERBOSE_TEST_LOGS \
            -e TEST_SHOWLOCALS \
            -e NO_TEST_TIMEOUT \
            -e NO_TD \
            -e TD_DISTRIBUTED \
            -e PR_LABELS \
            -e MAX_JOBS="$(nproc --ignore=2)" \
            -e SCCACHE_BUCKET \
            -e SCCACHE_REGION \
            -e XLA_CUDA \
            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
            -e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
            -e PYTORCH_TEST_RERUN_DISABLED_TESTS \
            -e SKIP_SCCACHE_INITIALIZATION=1 \
            -e HUGGING_FACE_HUB_TOKEN \
            -e SCRIBE_GRAPHQL_ACCESS_TOKEN \
            -e DASHBOARD_TAG \
            -e IS_A100_RUNNER \
            -e ARTIFACTS_FILE_SUFFIX \
            --memory="${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}g" \
            --memory-swap="${TOTAL_MEMORY_WITH_SWAP}g" \
            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
            --ipc=host \
            ${SHM_OPTS} \
            --tty \
            --detach \
            --name="${container_name}" \
            ${JENKINS_USER} \
            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
            -w /var/lib/jenkins/workspace \
            "${DOCKER_IMAGE}" \
            ${DOCKER_SHELL_CMD}
          )
          # Propagate download.pytorch.org IP to container
          grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" sudo bash -c "/bin/cat >> /etc/hosts"
          echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}"

          if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then
            docker exec -t "${container_name}" sh -c "python3 -m pip install -r .ci/docker/requirements-ci.txt"
          fi

          docker exec -t "${container_name}" sh -c "python3 -m pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}"

      - name: Upload pytest cache if tests failed
        uses: ./.github/actions/pytest-cache-upload
        continue-on-error: true
        if: failure() && steps.test.conclusion && steps.test.conclusion == 'failure'
        with:
          cache_dir: .pytest_cache
          shard: ${{ matrix.shard }}
          sha: ${{ github.event.pull_request.head.sha || github.sha }}
          test_config: ${{ matrix.config }}
          job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}

      - name: Upload the benchmark results
        uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
        with:
          benchmark-results-dir: test/test-reports
          dry-run: false
          schema-version: v3
          github-token: ${{ secrets.GITHUB_TOKEN }}

      - name: Print remaining test logs
        shell: bash
        if: always() && steps.test.conclusion
        run: |
          cat test/**/*_toprint.log || true

      - name: Stop monitoring script
        if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }}
        shell: bash
        continue-on-error: true
        env:
          MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }}
        run: |
          kill "$MONITOR_SCRIPT_PID"

      - name: Upload test artifacts
        uses: ./.github/actions/upload-test-artifacts
        if: always() && steps.test.conclusion && steps.test.conclusion != 'skipped'
        with:
          file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
          use-gha: ${{ inputs.use-gha }}
          s3-bucket: ${{ inputs.s3-bucket }}

      - name: Collect backtraces from coredumps (if any)
        if: always()
        run: |
          # shellcheck disable=SC2156
          find . -iname "core.[1-9]*" -exec docker exec "${DOCKER_CONTAINER_ID}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \;
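          # For example (hypothetical core file): if the tests left ./test/core.12345
          # behind, the find above invokes, for each match:
          #   docker exec "${DOCKER_CONTAINER_ID}" sh -c "gdb python ./test/core.12345 -ex 'bt' -ex 'q'"
          # printing a backtrace for the crashed python process into the job log.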
      - name: Store Core dumps on S3
        uses: seemethere/upload-artifact-s3@v5
        if: failure()
        with:
          name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}
          retention-days: 14
          if-no-files-found: ignore
          path: ./**/core.[1-9]*

      - name: Teardown Linux
        uses: pytorch/test-infra/.github/actions/teardown-linux@main
        if: always() && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false'

      # NB: We are currently seeing an intermittent GPU-related issue on G5 runners with
      # A10G GPUs. Once it happens, trying to reset the GPU as done in setup-nvidia does
      # not seem to help. Here are some symptoms:
      #   * Calling nvidia-smi times out after 60 seconds
      #   * nvidia-smi fails with an "unable to determine the device handle for GPU
      #     unknown" error
      #   * Tests fail with a missing CUDA GPU error when initializing CUDA in PyTorch
      #   * Running docker with --gpus all fails with an error response from the daemon
      #
      # As both the root cause and the recovery path are unclear, let's take the runner
      # out of service so that it doesn't get any more jobs
      - name: Check NVIDIA driver installation step
        if: failure() && steps.install-nvidia-driver.outcome && steps.install-nvidia-driver.outcome != 'skipped'
        shell: bash
        env:
          RUNNER_WORKSPACE: ${{ runner.workspace }}
        run: |
          set +e
          set -x

          nvidia-smi
          # NB: Surprisingly, nvidia-smi returns successfully with code 0 even when
          # the driver has already crashed, as it can still get the driver version and
          # some basic information like the bus ID. However, the rest of the information
          # would be missing (ERR!), for example:
          #
          # +-----------------------------------------------------------------------------+
          # | NVIDIA-SMI 525.89.02    Driver Version: 525.89.02    CUDA Version: 12.0     |
          # |-------------------------------+----------------------+----------------------+
          # | GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
          # | Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
          # |                               |                      |               MIG M. |
          # |===============================+======================+======================|
          # |   0  ERR!                Off  | 00000000:00:1E.0 Off |                 ERR! |
          # |ERR!  ERR! ERR!    ERR! / ERR! |   4184MiB / 23028MiB |    ERR!      Default |
          # |                               |                      |                 ERR! |
          # +-------------------------------+----------------------+----------------------+
          #
          # +-----------------------------------------------------------------------------+
          # | Processes:                                                                  |
          # |  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
          # |        ID   ID                                                   Usage      |
          # |=============================================================================|
          # +-----------------------------------------------------------------------------+
          #
          # This should be reported as a failure instead, as it is guaranteed to fail when
          # Docker tries to run with --gpus all
          #
          # So, the correct check here is to query one of the missing pieces of info, such
          # as the GPU name, so that the command fails accordingly
          nvidia-smi --query-gpu=gpu_name --format=csv,noheader --id=0
          NVIDIA_SMI_STATUS=$?
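          # On a healthy GPU the query above prints the product name (e.g. "NVIDIA A10G")
          # and exits 0; on a crashed driver the field is unavailable and the query exits
          # non-zero, which is what the check below relies on.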
          # These are the acceptable return codes from nvidia-smi, as copied from the
          # setup-nvidia GitHub action
          if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
            echo "NVIDIA driver installation has failed, shutting down the runner..."
            .github/scripts/stop_runner_service.sh
          fi

          # For a runner with multiple GPUs, we also want to confirm that the number of
          # GPUs is a power of 2, i.e. 1, 2, 4, or 8. This is to avoid flaky test issues
          # when one GPU fails, see https://github.com/pytorch/test-infra/issues/4000
          GPU_COUNT=$(nvidia-smi --list-gpus | wc -l)
          NVIDIA_SMI_STATUS=$?

          # These are the acceptable return codes from nvidia-smi, as copied from the
          # setup-nvidia GitHub action
          if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
            echo "NVIDIA driver installation has failed, shutting down the runner..."
            .github/scripts/stop_runner_service.sh
          fi

          # Check that the GPU count is a power of 2
          if [ "$GPU_COUNT" -le 8 ] && [ "$GPU_COUNT" -ne 1 ] && [ "$GPU_COUNT" -ne 2 ] && [ "$GPU_COUNT" -ne 4 ] && [ "$GPU_COUNT" -ne 8 ]; then
            echo "NVIDIA driver detects $GPU_COUNT GPUs. The runner has a broken GPU, shutting it down..."
            .github/scripts/stop_runner_service.sh
          fi

      - name: Cleanup docker
        if: always() && inputs.build-environment == 'linux-s390x-binary-manywheel'
        shell: bash
        run: |
          # on s390x, stop all running containers for a clean worker stop
          # (docker stop/kill take container IDs, not an -a flag)
          docker ps -q | xargs -r docker stop || true
          docker ps -q | xargs -r docker kill || true