pytorch/.github/workflows/_linux-test.yml
Michael Suo 769df7430d [lint] create a workflow consistency linter (#80200)
In order to maintain consistency between jobs, introduce a linter that
checks whether jobs sharing the same `sync-tag` are indeed the same.

`sync-tag` is just a dummy input on the reusable workflow. I chose to
use a dummy input over the following alternatives:
- The job's id isn't great, because we are likely to change a job's id
  (say, when upgrading CUDA or linux versions)
- The job's name doesn't work as we have build/test jobs that share the
  same name
Pull Request resolved: https://github.com/pytorch/pytorch/pull/80200
Approved by: https://github.com/janeyx99
2022-07-05 17:08:06 +00:00

203 lines
7.4 KiB
YAML

# Reusable workflow (triggered only through `workflow_call`) that runs the
# Linux test job for a given build environment, test matrix and docker image.
name: linux-test

on:
  workflow_call:
    inputs:
      build-environment:
        required: true
        type: string
        description: Top-level label for what's being built/tested.
      test-matrix:
        required: true
        type: string
        description: JSON description of what test configs to run.
      docker-image:
        required: true
        type: string
        description: Docker image to run in.
      # Not read by any step in this workflow; it exists only so the workflow
      # consistency linter can group callers that must stay identical.
      sync-tag:
        required: false
        type: string
        default: ""
        description: |
          If this is set, our linter will use this to make sure that every other
          job with the same `sync-tag` is identical.

# Available to every step; the Test step falls back to `master` when this
# expands to an empty value.
env:
  GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
jobs:
  # Fans out over the matrix decoded from the `test-matrix` input and runs the
  # selected test script inside the `docker-image` container.
  test:
    # Don't run on forked repos.
    if: github.repository_owner == 'pytorch'
    strategy:
      matrix: ${{ fromJSON(inputs.test-matrix) }}
      # Keep the remaining matrix entries running when one of them fails.
      fail-fast: false
    runs-on: ${{ matrix.runner }}
    steps:
      # [see note: pytorch repo ref]
      - name: Checkout PyTorch
        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master

      - name: Setup Linux
        uses: ./.github/actions/setup-linux

      - name: Setup SSH (Click me for login details)
        uses: ./.github/actions/setup-ssh
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}

      - name: Pull docker image
        uses: ./.github/actions/pull-docker-image
        with:
          docker-image: ${{ inputs.docker-image }}

      # Only for CUDA build environments whose matrix config is not a `nogpu`
      # one. Exports GPU_FLAG through GITHUB_ENV so the Test step can hand it
      # to `docker run`. The install is retried up to 3 times.
      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
        uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a
        if: contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu')
        with:
          timeout_minutes: 10
          max_attempts: 3
          command: |
            set -ex
            bash .github/scripts/install_nvidia_utils_linux.sh
            echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"

      - name: Download build artifacts
        uses: ./.github/actions/download-build-artifacts
        with:
          name: ${{ inputs.build-environment }}

      # Outputs `branch` and `tag`, consumed by the steps below.
      - name: Parse ref
        id: parse-ref
        run: .github/scripts/parse_ref.py

      # Starts a detached container from the test image, then installs the
      # built wheel and runs the selected test script inside it.
      - name: Test
        id: test
        env:
          BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          BRANCH: ${{ steps.parse-ref.outputs.branch }}
          # Fall back to the pushed commit when there is no pull request.
          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
          BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }}
          PYTORCH_RETRY_TEST_CASES: 1
          PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
          TEST_CONFIG: ${{ matrix.config }}
          SHARD_NUMBER: ${{ matrix.shard }}
          NUM_TEST_SHARDS: ${{ matrix.num_shards }}
          PR_BODY: ${{ github.event.pull_request.body }}
          SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
          # 2g of shared memory for CUDA build environments, 1g otherwise.
          SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
          DOCKER_IMAGE: ${{ inputs.docker-image }}
          # '0' for XLA build environments, empty for everything else.
          XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
          XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
        timeout-minutes: 240
        run: |
          set -x

          # Pick the test entry point from the matrix config / build environment.
          if [[ $TEST_CONFIG == 'multigpu' ]]; then
            TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
          elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
            TEST_COMMAND=.jenkins/caffe2/test.sh
          else
            TEST_COMMAND=.jenkins/pytorch/test.sh
          fi

          COMMIT_MESSAGES=$(git cherry -v "origin/${GIT_DEFAULT_BRANCH:-master}")

          # trim all new lines from commit messages + PR_BODY to avoid issues with batch environment
          # variable copying. see https://github.com/pytorch/pytorch/pull/80043#issuecomment-1167796028
          export COMMIT_MESSAGES="${COMMIT_MESSAGES//[$'\n\r']}"
          export PR_BODY="${PR_BODY//[$'\n\r']}"

          # detached container should get cleaned up by teardown_ec2_linux
          # TODO: Stop building test binaries as part of the build phase
          # No --name is passed: the container is addressed through the ID that
          # `docker run --detach` prints, which is captured in container_name.
          # Used for GPU_FLAG since that doesn't play nice
          # shellcheck disable=SC2086,SC2090
          container_name=$(docker run \
            ${GPU_FLAG:-} \
            -e BUILD_ENVIRONMENT \
            -e PR_NUMBER \
            -e GITHUB_ACTIONS \
            -e BASE_SHA \
            -e BRANCH \
            -e SHA1 \
            -e AWS_DEFAULT_REGION \
            -e IN_WHEEL_TEST \
            -e SHARD_NUMBER \
            -e TEST_CONFIG \
            -e NUM_TEST_SHARDS \
            -e PR_BODY \
            -e COMMIT_MESSAGES \
            -e PYTORCH_RETRY_TEST_CASES \
            -e PYTORCH_OVERRIDE_FLAKY_SIGNAL \
            -e PR_LABELS \
            -e MAX_JOBS="$(nproc --ignore=2)" \
            -e SCCACHE_BUCKET \
            -e XLA_CUDA \
            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
            --ulimit stack=10485760:83886080 \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
            --ipc=host \
            --shm-size="${SHM_SIZE}" \
            --tty \
            --detach \
            --user jenkins \
            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
            -w /var/lib/jenkins/workspace \
            "${DOCKER_IMAGE}"
          )
          docker exec -t "${container_name}" sh -c "pip install dist/*.whl && ${TEST_COMMAND}"

      # Runs even after a failure so the later upload steps can use the job id.
      - name: Get workflow job id
        id: get-job-id
        uses: ./.github/actions/get-workflow-job-id
        if: always()
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}

      # Upload only when the Test step actually finished (passed or failed),
      # not when it was skipped or cancelled.
      - name: Upload test artifacts
        uses: ./.github/actions/upload-test-artifacts
        if: always() && (steps.test.conclusion == 'success' || steps.test.conclusion == 'failure')
        with:
          file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}

      # Only on failure; a run that produced no core files is not an error.
      - name: Store Core dumps on S3
        uses: seemethere/upload-artifact-s3@v5
        if: failure()
        with:
          name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}
          retention-days: 14
          if-no-files-found: ignore
          path: ./**/core.[1-9]*

      - name: Upload test statistics
        if: always()
        env:
          AWS_DEFAULT_REGION: us-east-1
          # Same expression as the workflow-level `env` entry of this name.
          GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
          BRANCH: ${{ steps.parse-ref.outputs.branch }}
          TEST_CONFIG: ${{ matrix.config }}
          SHARD_NUMBER: ${{ matrix.shard }}
          BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          PYTORCH_RETRY_TEST_CASES: 1
          PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
          TAG: ${{ steps.parse-ref.outputs.tag }}
          WORKFLOW_ID: ${{ github.run_id }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GHA_WORKFLOW_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
        shell: bash
        run: |
          set -x
          python3 -m pip install -r requirements.txt
          python3 -m pip install boto3==1.19.12
          python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test

      # Always runs, including after failures in earlier steps.
      - name: Teardown Linux
        uses: ./.github/actions/teardown-linux
        if: always()