# Reusable workflow (`workflow_call`) that runs one `test` job per entry of the
# caller-supplied `test-matrix`. Each job starts a detached container from the
# requested docker image, installs the wheel found under dist/ into it and runs
# the test script selected from TEST_CONFIG / BUILD_ENVIRONMENT inside that
# container. The remaining steps upload logs/artifacts/core dumps and tear the
# runner down.
name: linux-test

on:
  workflow_call:
    inputs:
      build-environment:
        required: true
        type: string
        description: Top-level label for what's being built/tested.
      test-matrix:
        required: true
        type: string
        description: JSON description of what test configs to run.
      docker-image:
        required: true
        type: string
        description: Docker image to run in.
      sync-tag:
        required: false
        type: string
        default: ""
        description: |
          If this is set, our linter will use this to make sure that every other
          job with the same `sync-tag` is identical.
      timeout-minutes:
        required: false
        type: number
        default: 240
        description: |
          Set the maximum (in minutes) how long the workflow should take to finish
      use-gha:
        required: false
        type: string
        default: ""
        description: If set to any value, upload to GHA. Otherwise upload to S3.
      dashboard-tag:
        required: false
        type: string
        default: ""
    secrets:
      HUGGING_FACE_HUB_TOKEN:
        required: false
        description: |
          HF Auth token to avoid rate limits when downloading models or datasets from hub

env:
  GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}

jobs:
  test:
    # Don't run on forked repos or empty test matrix
    if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]'
    strategy:
      matrix: ${{ fromJSON(inputs.test-matrix) }}
      fail-fast: false
    runs-on: ${{ matrix.runner }}
    # Memory-leak-check configs get a fixed 600 minute budget; everything else
    # uses the caller-provided `timeout-minutes` input.
    timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }}
    steps:
      - name: Setup SSH (Click me for login details)
        uses: pytorch/test-infra/.github/actions/setup-ssh@main
        if: ${{ !contains(matrix.runner, 'gcp.a100') }}
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
          instructions: |
            All testing is done inside the container, to start an interactive session run:
              docker exec -it $(docker container ps --format '{{.ID}}') bash

      - name: Checkout PyTorch
        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main

      - name: Setup Linux
        uses: ./.github/actions/setup-linux

      - name: Calculate docker image
        id: calculate-docker-image
        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
        with:
          docker-image-name: ${{ inputs.docker-image }}

      - name: Pull docker image
        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
        with:
          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

      # Only for CUDA build environments whose test config is not a `nogpu` one.
      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
        id: install-nvidia-driver
        uses: pytorch/test-infra/.github/actions/setup-nvidia@main
        if: contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu')

      - name: Lock NVIDIA A100 40GB Frequency
        run: |
          sudo nvidia-smi -pm 1
          sudo nvidia-smi -ac 1215,1410
          nvidia-smi
        if: contains(matrix.runner, 'a100')

      # Best effort (continue-on-error): the monitor runs in the background and
      # its PID is exported so the "Stop monitoring script" step can kill it.
      - name: Start monitoring script
        id: monitor-script
        shell: bash
        continue-on-error: true
        run: |
          python3 -m pip install psutil==5.9.1 nvidia-ml-py==11.525.84
          python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
          echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"

      - name: Download build artifacts
        uses: ./.github/actions/download-build-artifacts
        with:
          name: ${{ inputs.build-environment }}

      - name: Parse ref
        id: parse-ref
        run: .github/scripts/parse_ref.py

      - name: Check for keep-going label and re-enabled test issues
        # This uses the filter-test-configs action because it conveniently
        # checks for labels and re-enabled test issues. It does not actually do
        # any filtering. All filtering is done in the build step.
        id: keep-going
        uses: ./.github/actions/filter-test-configs
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          test-matrix: ${{ inputs.test-matrix }}

      - name: Download pytest cache
        uses: ./.github/actions/pytest-cache-download
        continue-on-error: true
        with:
          cache_dir: .pytest_cache
          job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}_${{ github.job }}_${{ matrix.config }}

      # The Test step is capped at 30 minutes less than the job timeout
      # (presumably to leave time for the upload/teardown steps - confirm).
      - name: Set Test step time
        id: test-timeout
        shell: bash
        env:
          JOB_TIMEOUT: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }}
        run: |
          echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}"

      - name: Test
        id: test
        timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
        env:
          BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          GITHUB_REPOSITORY: ${{ github.repository }}
          GITHUB_WORKFLOW: ${{ github.workflow }}
          GITHUB_JOB: ${{ github.job }}
          GITHUB_RUN_ID: ${{ github.run_id }}
          GITHUB_RUN_NUMBER: ${{ github.run_number }}
          GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
          BRANCH: ${{ steps.parse-ref.outputs.branch }}
          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
          BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }}
          PYTORCH_RETRY_TEST_CASES: "1"
          PYTORCH_OVERRIDE_FLAKY_SIGNAL: "1"
          TEST_CONFIG: ${{ matrix.config }}
          SHARD_NUMBER: ${{ matrix.shard }}
          NUM_TEST_SHARDS: ${{ matrix.num_shards }}
          REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
          CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
          SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
          SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
          SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
          DOCKER_IMAGE: ${{ inputs.docker-image }}
          XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
          XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
          PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
          DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
        run: |
          set -x

          # Pick the test entry point: multigpu config first, then ONNX build
          # environments, otherwise the default PyTorch test script.
          if [[ $TEST_CONFIG == 'multigpu' ]]; then
            TEST_COMMAND=.ci/pytorch/multigpu-test.sh
          elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
            TEST_COMMAND=.ci/onnx/test.sh
          else
            TEST_COMMAND=.ci/pytorch/test.sh
          fi

          # detached container should get cleaned up by teardown_ec2_linux
          # TODO: Stop building test binaries as part of the build phase
          # NOTE: no --name is passed on purpose. `container_name` is only
          # assigned by this very command (docker prints the ID of the detached
          # container), so referencing it inside the command would expand to an
          # empty name, or to whatever stray value the environment carries.
          # Used for GPU_FLAG since that doesn't play nice
          # shellcheck disable=SC2086,SC2090
          container_name=$(docker run \
            ${GPU_FLAG:-} \
            -e BUILD_ENVIRONMENT \
            -e PR_NUMBER \
            -e GITHUB_ACTIONS \
            -e GITHUB_REPOSITORY \
            -e GITHUB_WORKFLOW \
            -e GITHUB_JOB \
            -e GITHUB_RUN_ID \
            -e GITHUB_RUN_NUMBER \
            -e GITHUB_RUN_ATTEMPT \
            -e BASE_SHA \
            -e BRANCH \
            -e SHA1 \
            -e AWS_DEFAULT_REGION \
            -e IN_WHEEL_TEST \
            -e SHARD_NUMBER \
            -e TEST_CONFIG \
            -e NUM_TEST_SHARDS \
            -e REENABLED_ISSUES \
            -e CONTINUE_THROUGH_ERROR \
            -e PYTORCH_RETRY_TEST_CASES \
            -e PYTORCH_OVERRIDE_FLAKY_SIGNAL \
            -e PR_LABELS \
            -e MAX_JOBS="$(nproc --ignore=2)" \
            -e SCCACHE_BUCKET \
            -e SCCACHE_S3_KEY_PREFIX \
            -e XLA_CUDA \
            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
            -e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
            -e PYTORCH_TEST_RERUN_DISABLED_TESTS \
            -e SKIP_SCCACHE_INITIALIZATION=1 \
            -e HUGGING_FACE_HUB_TOKEN \
            -e DASHBOARD_TAG \
            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
            --ulimit stack=10485760:83886080 \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
            --ipc=host \
            --shm-size="${SHM_SIZE}" \
            --tty \
            --detach \
            --user jenkins \
            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
            -w /var/lib/jenkins/workspace \
            "${DOCKER_IMAGE}"
          )
          # Propagate download.pytorch.org IP to container
          grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" sudo bash -c "/bin/cat >> /etc/hosts"
          # Exported so later steps (core dump backtraces) can reach the container
          echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}"
          # The dist/*.whl glob is expanded on the host, before `sh -c` runs in the container
          docker exec -t "${container_name}" sh -c "pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}"

      - name: Upload pytest cache if tests failed
        uses: ./.github/actions/pytest-cache-upload
        continue-on-error: true
        if: failure() && steps.test.conclusion && steps.test.conclusion == 'failure'
        with:
          cache_dir: .pytest_cache
          shard: ${{ matrix.shard }}
          job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}_${{ github.job }}_${{ matrix.config }}

      - name: Print remaining test logs
        shell: bash
        if: always() && steps.test.conclusion
        run: |
          cat test/**/*_toprint.log || true

      - name: Get workflow job id
        id: get-job-id
        uses: ./.github/actions/get-workflow-job-id
        if: always()
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}

      # Skipped when the monitor step never published a PID.
      - name: Stop monitoring script
        if: always() && steps.monitor-script.outputs.monitor-script-pid
        shell: bash
        continue-on-error: true
        env:
          MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }}
        run: |
          kill "$MONITOR_SCRIPT_PID"

      - name: Upload test artifacts
        uses: ./.github/actions/upload-test-artifacts
        if: always() && steps.test.conclusion && steps.test.conclusion != 'skipped'
        with:
          file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
          use-gha: ${{ inputs.use-gha }}

      - name: Collect backtraces from coredumps (if any)
        if: always()
        run: |
          # shellcheck disable=SC2156
          find . -iname "core.[1-9]*" -exec docker exec "${DOCKER_CONTAINER_ID}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \;

      - name: Store Core dumps on S3
        uses: seemethere/upload-artifact-s3@v5
        if: failure()
        with:
          name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}
          retention-days: 14
          if-no-files-found: ignore
          path: ./**/core.[1-9]*

      - name: Teardown Linux
        uses: pytorch/test-infra/.github/actions/teardown-linux@main
        if: always()

      # NB: We are currently having an intermittent GPU-related issue on G5 runners with
      # A10G GPU. Once this happens, trying to reset the GPU as done in setup-nvidia does
      # not seem to help. Here are some symptoms:
      #   * Calling nvidia-smi times out after 60 seconds
      #   * Fail to run nvidia-smi with an unable to determine the device handle for GPU
      #     unknown error
      #   * Test fails with a missing CUDA GPU error when initializing CUDA in PyTorch
      #   * Run docker --gpus all fails with error response from daemon
      #
      # As both the root cause and recovery path are unclear, let's take the runner out of
      # service so that it doesn't get any more jobs
      - name: Check NVIDIA driver installation step
        if: failure() && steps.install-nvidia-driver.outcome && steps.install-nvidia-driver.outcome != 'skipped'
        shell: bash
        env:
          RUNNER_WORKSPACE: ${{ runner.workspace }}
        run: |
          # Exit codes are inspected by hand below, so do not abort on the first failure
          set +e
          set -x

          nvidia-smi
          # NB: Surprisingly, nvidia-smi command returns successfully with return code 0 even in
          # the case where the driver has already crashed as it still can get the driver version
          # and some basic information like the bus ID. However, the rest of the information
          # would be missing (ERR!), for example:
          #
          # +-----------------------------------------------------------------------------+
          # | NVIDIA-SMI 525.89.02    Driver Version: 525.89.02    CUDA Version: 12.0     |
          # |-------------------------------+----------------------+----------------------+
          # | GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
          # | Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
          # |                               |                      |               MIG M. |
          # |===============================+======================+======================|
          # |   0  ERR!                 Off | 00000000:00:1E.0 Off |                 ERR! |
          # |ERR!  ERR! ERR!    ERR! / ERR! |   4184MiB / 23028MiB |    ERR!      Default |
          # |                               |                      |                 ERR! |
          # +-------------------------------+----------------------+----------------------+
          #
          # +-----------------------------------------------------------------------------+
          # | Processes:                                                                  |
          # |  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
          # |        ID   ID                                                   Usage      |
          # |=============================================================================|
          # +-----------------------------------------------------------------------------+
          #
          # This should be reported as a failure instead as it will guarantee to fail when
          # Docker tries to run with --gpus all
          #
          # So, the correct check here is to query one of the missing piece of info like
          # GPU name, so that the command can fail accordingly
          nvidia-smi --query-gpu=gpu_name --format=csv,noheader --id=0
          NVIDIA_SMI_STATUS=$?

          # These are acceptable return code from nvidia-smi as copied from setup-nvidia GitHub action
          if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
            echo "NVIDIA driver installation has failed, shutting down the runner..."
            .github/scripts/stop_runner_service.sh
          fi

          # For runner with multiple GPUs, we also want to confirm that the number of GPUs are the
          # power of 2, i.e. 1, 2, 4, or 8. This is to avoid flaky test issue when one GPU fails
          # https://github.com/pytorch/test-infra/issues/4000
          #
          # NOTE: `$?` after a pipeline only reflects an nvidia-smi failure when pipefail is
          # active; this relies on the step declaring `shell: bash` (see the GitHub Actions
          # documentation on the bash invocation used for an explicit `shell: bash`).
          GPU_COUNT=$(nvidia-smi --list-gpus | wc -l)
          NVIDIA_SMI_STATUS=$?

          # These are acceptable return code from nvidia-smi as copied from setup-nvidia GitHub action
          if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
            echo "NVIDIA driver installation has failed, shutting down the runner..."
            .github/scripts/stop_runner_service.sh
          fi

          # Check the GPU count to be a power of 2
          if [ "$GPU_COUNT" -le 8 ] && [ "$GPU_COUNT" -ne 1 ] && [ "$GPU_COUNT" -ne 2 ] && [ "$GPU_COUNT" -ne 4 ] && [ "$GPU_COUNT" -ne 8 ]; then
            echo "NVIDIA driver detects $GPU_COUNT GPUs. The runner has a broken GPU, shutting it down..."
            .github/scripts/stop_runner_service.sh
          fi