mirror of
https://github.com/saymrwulf/pytorch.git
synced 2026-05-15 21:00:47 +00:00
tbh at this point it might be easier to make a new workflow and copy the relevant jobs... Changes: * Disable cuda mem leak check except for on scheduled workflows * Make pull and trunk run on a schedule which will run the memory leak check * Periodic will always run the memory leak check -> periodic does not have parallelization anymore * Concurrency check changed to be slightly more generous Pull Request resolved: https://github.com/pytorch/pytorch/pull/88373 Approved by: https://github.com/ZainRizvi, https://github.com/huydhn
253 lines
9.5 KiB
YAML
253 lines
9.5 KiB
YAML
name: linux-test
|
|
|
|
on:
|
|
workflow_call:
|
|
inputs:
|
|
build-environment:
|
|
required: true
|
|
type: string
|
|
description: Top-level label for what's being built/tested.
|
|
test-matrix:
|
|
required: true
|
|
type: string
|
|
description: JSON description of what test configs to run.
|
|
docker-image:
|
|
required: true
|
|
type: string
|
|
description: Docker image to run in.
|
|
sync-tag:
|
|
required: false
|
|
type: string
|
|
default: ""
|
|
description: |
|
|
If this is set, our linter will use this to make sure that every other
|
|
job with the same `sync-tag` is identical.
|
|
|
|
env:
|
|
GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
|
|
|
|
jobs:
|
|
# This needs to be run right before the test starts so that it can gather the
|
|
# latest labels from the PR
|
|
filter:
|
|
runs-on: [self-hosted, linux.large]
|
|
outputs:
|
|
test-matrix: ${{ steps.filter.outputs.test-matrix }}
|
|
is-test-matrix-empty: ${{ steps.filter.outputs.is-test-matrix-empty }}
|
|
steps:
|
|
- name: Checkout PyTorch
|
|
uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
|
|
with:
|
|
fetch-depth: 1
|
|
submodules: false
|
|
|
|
- name: Select all requested test configurations
|
|
id: filter
|
|
uses: ./.github/actions/filter-test-configs
|
|
with:
|
|
github-token: ${{ secrets.GITHUB_TOKEN }}
|
|
test-matrix: ${{ inputs.test-matrix }}
|
|
|
|
test:
|
|
needs: filter
|
|
# Don't run on forked repos or empty test matrix
|
|
if: github.repository_owner == 'pytorch' && needs.filter.outputs.is-test-matrix-empty == 'False'
|
|
strategy:
|
|
matrix: ${{ fromJSON(needs.filter.outputs.test-matrix) }}
|
|
fail-fast: false
|
|
runs-on: ${{ matrix.runner }}
|
|
steps:
|
|
- name: Checkout PyTorch
|
|
uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
|
|
|
|
- name: Setup Linux
|
|
uses: ./.github/actions/setup-linux
|
|
|
|
- name: Setup SSH (Click me for login details)
|
|
uses: pytorch/test-infra/.github/actions/setup-ssh@main
|
|
with:
|
|
github-secret: ${{ secrets.GITHUB_TOKEN }}
|
|
|
|
- name: Pull docker image
|
|
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
|
|
with:
|
|
docker-image: ${{ inputs.docker-image }}
|
|
|
|
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
|
|
uses: nick-fields/retry@3e91a01664abd3c5cd539100d10d33b9c5b68482
|
|
if: contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu')
|
|
with:
|
|
timeout_minutes: 10
|
|
max_attempts: 3
|
|
command: |
|
|
set -ex
|
|
bash .github/scripts/install_nvidia_utils_linux.sh
|
|
echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
|
|
|
|
- name: Start monitoring script
|
|
id: monitor-script
|
|
shell: bash
|
|
run: |
|
|
python3 -m pip install psutil==5.9.1
|
|
python3 -m pip install pynvml==11.4.1
|
|
python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
|
|
echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"
|
|
|
|
- name: Download build artifacts
|
|
uses: ./.github/actions/download-build-artifacts
|
|
with:
|
|
name: ${{ inputs.build-environment }}
|
|
|
|
- name: Parse ref
|
|
id: parse-ref
|
|
run: .github/scripts/parse_ref.py
|
|
|
|
- name: Test
|
|
id: test
|
|
env:
|
|
BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
|
|
PR_NUMBER: ${{ github.event.pull_request.number }}
|
|
BRANCH: ${{ steps.parse-ref.outputs.branch }}
|
|
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
|
|
BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }}
|
|
PYTORCH_RETRY_TEST_CASES: 1
|
|
PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
|
|
TEST_CONFIG: ${{ matrix.config }}
|
|
SHARD_NUMBER: ${{ matrix.shard }}
|
|
NUM_TEST_SHARDS: ${{ matrix.num_shards }}
|
|
PR_BODY: ${{ github.event.pull_request.body }}
|
|
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
|
|
SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
|
|
SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
|
|
DOCKER_IMAGE: ${{ inputs.docker-image }}
|
|
XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
|
|
XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
|
|
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0'}}
|
|
timeout-minutes: 240
|
|
run: |
|
|
set -x
|
|
|
|
if [[ $TEST_CONFIG == 'multigpu' ]]; then
|
|
TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
|
|
elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
|
|
TEST_COMMAND=.jenkins/caffe2/test.sh
|
|
else
|
|
TEST_COMMAND=.jenkins/pytorch/test.sh
|
|
fi
|
|
|
|
COMMIT_MESSAGES=$(git cherry -v "origin/${GIT_DEFAULT_BRANCH:-master}")
|
|
|
|
# sanitize the input commit message and PR body here:
|
|
#
|
|
# trim all new lines from commit messages + PR_BODY to avoid issues with batch environment
|
|
# variable copying. see https://github.com/pytorch/pytorch/pull/80043#issuecomment-1167796028
|
|
COMMIT_MESSAGES="${COMMIT_MESSAGES//[$'\n\r']}"
|
|
PR_BODY="${PR_BODY//[$'\n\r']}"
|
|
|
|
# then trim all special characters like single and double quotes to avoid unescaped inputs to
|
|
# wreak havoc internally
|
|
export COMMIT_MESSAGES="${COMMIT_MESSAGES//[\'\"]}"
|
|
export PR_BODY="${PR_BODY//[\'\"]}"
|
|
|
|
# detached container should get cleaned up by teardown_ec2_linux
|
|
# TODO: Stop building test binaries as part of the build phase
|
|
# Used for GPU_FLAG since that doesn't play nice
|
|
# shellcheck disable=SC2086,SC2090
|
|
container_name=$(docker run \
|
|
${GPU_FLAG:-} \
|
|
-e BUILD_ENVIRONMENT \
|
|
-e PR_NUMBER \
|
|
-e GITHUB_ACTIONS \
|
|
-e BASE_SHA \
|
|
-e BRANCH \
|
|
-e SHA1 \
|
|
-e AWS_DEFAULT_REGION \
|
|
-e IN_WHEEL_TEST \
|
|
-e SHARD_NUMBER \
|
|
-e TEST_CONFIG \
|
|
-e NUM_TEST_SHARDS \
|
|
-e PR_BODY \
|
|
-e COMMIT_MESSAGES \
|
|
-e PYTORCH_RETRY_TEST_CASES \
|
|
-e PYTORCH_OVERRIDE_FLAKY_SIGNAL \
|
|
-e PR_LABELS \
|
|
-e MAX_JOBS="$(nproc --ignore=2)" \
|
|
-e SCCACHE_BUCKET \
|
|
-e SCCACHE_S3_KEY_PREFIX \
|
|
-e XLA_CUDA \
|
|
-e XLA_CLANG_CACHE_S3_BUCKET_NAME \
|
|
-e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
|
|
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
|
|
--ulimit stack=10485760:83886080 \
|
|
--security-opt seccomp=unconfined \
|
|
--cap-add=SYS_PTRACE \
|
|
--ipc=host \
|
|
--shm-size="${SHM_SIZE}" \
|
|
--tty \
|
|
--detach \
|
|
--name="${container_name}" \
|
|
--user jenkins \
|
|
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
|
|
-w /var/lib/jenkins/workspace \
|
|
"${DOCKER_IMAGE}"
|
|
)
|
|
docker exec -t "${container_name}" sh -c "pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}"
|
|
|
|
- name: Get workflow job id
|
|
id: get-job-id
|
|
uses: ./.github/actions/get-workflow-job-id
|
|
if: always()
|
|
with:
|
|
github-token: ${{ secrets.GITHUB_TOKEN }}
|
|
|
|
- name: Stop monitoring script
|
|
if: always() && steps.monitor-script.outputs.monitor-script-pid
|
|
shell: bash
|
|
continue-on-error: true
|
|
env:
|
|
MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }}
|
|
run: |
|
|
kill "$MONITOR_SCRIPT_PID"
|
|
|
|
- name: Upload test artifacts
|
|
uses: ./.github/actions/upload-test-artifacts
|
|
if: always() && (steps.test.conclusion == 'success' || steps.test.conclusion == 'failure')
|
|
with:
|
|
file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
|
|
|
|
- name: Store Core dumps on S3
|
|
uses: seemethere/upload-artifact-s3@v5
|
|
if: failure()
|
|
with:
|
|
name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}
|
|
retention-days: 14
|
|
if-no-files-found: ignore
|
|
path: ./**/core.[1-9]*
|
|
|
|
- name: Upload test statistics
|
|
if: always()
|
|
env:
|
|
AWS_DEFAULT_REGION: us-east-1
|
|
GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
|
|
BRANCH: ${{ steps.parse-ref.outputs.branch }}
|
|
TEST_CONFIG: ${{ matrix.config }}
|
|
SHARD_NUMBER: ${{ matrix.shard }}
|
|
BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
|
|
PR_NUMBER: ${{ github.event.pull_request.number }}
|
|
PYTORCH_RETRY_TEST_CASES: 1
|
|
PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
|
|
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
|
|
TAG: ${{ steps.parse-ref.outputs.tag }}
|
|
WORKFLOW_ID: ${{ github.run_id }}
|
|
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
|
GHA_WORKFLOW_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
|
|
shell: bash
|
|
run: |
|
|
set -x
|
|
python3 -m pip install -r requirements.txt
|
|
python3 -m pip install boto3==1.19.12
|
|
python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
|
|
|
|
- name: Teardown Linux
|
|
uses: pytorch/test-infra/.github/actions/teardown-linux@main
|
|
if: always()
|