mirror of
https://github.com/saymrwulf/pytorch.git
synced 2026-05-14 20:57:59 +00:00
Rerun all disabled tests to gather their latest results so that we can close disabled tickets automatically. When running under this mode (RERUN_DISABLED_TESTS=true), only disabled tests are run while the rest are skipped `<skipped message="Test is enabled but --rerun-disabled-tests verification mode is set, so only disabled tests are run" type="skip"/>`
The logic is roughly as follows: each test runs multiple times (n=50)
* If the disabled test passes, and it's flaky, do nothing because it's still flaky. In the test report, we'll see the test passes with the following skipped message:
```
<testcase classname="TestMultiprocessing" file="test_multiprocessing.py" line="357" name="test_fs" time="0.000" timestamp="0001-01-01T00:00:00">
<skipped message="{"flaky": True, "num_red": 4, "num_green": 0, "max_num_retries": 3, "rerun_disabled_test": true}" type="skip"/>
</testcase>
```
* If the disabled test passes every single time, and it is not flaky anymore, mark it so that it can be closed later. We will see the test runs and passes, i.e.
```
<testcase classname="TestCommonCUDA" name="test_out_warning_linalg_lu_factor_cuda" time="0.170" file="test_ops.py" />
```
* If the disabled test fails after all retries, this is also expected. So only report this but don't fail the job (because we don't care about red signals here), we'll see the test is skipped (without the `flaky` field), i.e.
```
<testcase classname="TestMultiprocessing" file="test_multiprocessing.py" line="357" name="test_fs" time="0.000" timestamp="0001-01-01T00:00:00">
<skipped message="{"num_red": 4, "num_green": 0, "max_num_retries": 3, "rerun_disabled_test": true}" type="skip"/>
</testcase>
```
This runs at the same schedule as `mem_leak_check` (daily). The change to update test stats, and (potentially) grouping on HUD, will come in separate PRs.
### Testing
* pull https://github.com/pytorch/pytorch/actions/runs/3447434434
* trunk https://github.com/pytorch/pytorch/actions/runs/3447434928
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88646
Approved by: https://github.com/clee2000
244 lines · 9.5 KiB · YAML
# TODO: this looks sort of similar to _linux-test, but there are like a dozen
# places where you would have to insert an if statement. Probably it's better to
# just use a different workflow altogether

name: test

on:
  workflow_call:
    inputs:
      build-environment:
        required: true
        type: string
        description: Top-level label for what's being built/tested.
      test-matrix:
        required: true
        type: string
        description: JSON description of what test configs to run.
      docker-image:
        required: true
        type: string
        description: Docker image to run in.
      sync-tag:
        required: false
        type: string
        default: ""
        description: |
          If this is set, our linter will use this to make sure that every other
          job with the same `sync-tag` is identical.
    secrets:
      AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID:
        required: true
        description: access key id for test stats upload
      AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY:
        required: true
        description: secret access key for test stats upload

env:
  GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}

jobs:
  test:
    # Don't run on forked repos.
    if: github.repository_owner == 'pytorch'
    timeout-minutes: 300
    strategy:
      matrix: ${{ fromJSON(inputs.test-matrix) }}
      fail-fast: false
    runs-on: ${{ matrix.runner }}
    steps:
      # [see note: pytorch repo ref]
      - name: Checkout PyTorch
        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
        with:
          no-sudo: true

      - name: Setup ROCm
        uses: ./.github/actions/setup-rocm

      - name: Pull docker image
        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
        with:
          docker-image: ${{ inputs.docker-image }}

      # Background process that samples system utilization for the duration of
      # the job; its PID is exported so the later "Stop monitoring script" step
      # can kill it.
      - name: Start monitoring script
        id: monitor-script
        shell: bash
        run: |
          python3 -m pip install psutil==5.9.1
          python3 -m pip install pynvml==11.4.1
          python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
          echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"

      - name: Download build artifacts
        uses: ./.github/actions/download-build-artifacts
        with:
          name: ${{ inputs.build-environment }}

      - name: Parse ref
        id: parse-ref
        run: .github/scripts/parse_ref.py

      - name: Test
        id: test
        env:
          BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          BRANCH: ${{ steps.parse-ref.outputs.branch }}
          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
          PYTORCH_RETRY_TEST_CASES: 1
          PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
          TEST_CONFIG: ${{ matrix.config }}
          SHARD_NUMBER: ${{ matrix.shard }}
          NUM_TEST_SHARDS: ${{ matrix.num_shards }}
          PR_BODY: ${{ github.event.pull_request.body }}
          SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
          DOCKER_IMAGE: ${{ inputs.docker-image }}
          XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
          PYTORCH_JIT_ENABLE_NVFUSER: 1
          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
          PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
        timeout-minutes: 270
        run: |
          set -x

          if [[ $TEST_CONFIG == 'multigpu' ]]; then
            TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
          elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
            TEST_COMMAND=.jenkins/caffe2/test.sh
          else
            TEST_COMMAND=.jenkins/pytorch/test.sh
          fi

          COMMIT_MESSAGES=$(git cherry -v "origin/${GIT_DEFAULT_BRANCH:-master}")

          # sanitize the input commit message and PR body here:
          #
          # trim all new lines from commit messages + PR_BODY to avoid issues with batch environment
          # variable copying. see https://github.com/pytorch/pytorch/pull/80043#issuecomment-1167796028
          COMMIT_MESSAGES="${COMMIT_MESSAGES//[$'\n\r']}"
          PR_BODY="${PR_BODY//[$'\n\r']}"

          # then trim all special characters like single and double quotes to avoid unescaped inputs to
          # wreak havoc internally
          export COMMIT_MESSAGES="${COMMIT_MESSAGES//[\'\"]}"
          export PR_BODY="${PR_BODY//[\'\"]}"

          # detached container should get cleaned up by teardown_ec2_linux
          # TODO: Stop building test binaries as part of the build phase
          # Used for GPU_FLAG since that doesn't play nice
          # shellcheck disable=SC2086,SC2090
          container_name=$(docker run \
            ${GPU_FLAG:-} \
            -e BUILD_ENVIRONMENT \
            -e PR_NUMBER \
            -e GITHUB_ACTIONS \
            -e BRANCH \
            -e SHA1 \
            -e AWS_DEFAULT_REGION \
            -e IN_WHEEL_TEST \
            -e SHARD_NUMBER \
            -e TEST_CONFIG \
            -e NUM_TEST_SHARDS \
            -e PR_BODY \
            -e COMMIT_MESSAGES \
            -e PYTORCH_RETRY_TEST_CASES \
            -e PYTORCH_OVERRIDE_FLAKY_SIGNAL \
            -e MAX_JOBS="$(nproc --ignore=2)" \
            -e SCCACHE_BUCKET \
            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
            -e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
            -e PYTORCH_TEST_RERUN_DISABLED_TESTS \
            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
            --ulimit stack=10485760:83886080 \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
            --shm-size="8g" \
            --tty \
            --detach \
            --name="${container_name}" \
            --user jenkins \
            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
            -w /var/lib/jenkins/workspace \
            "${DOCKER_IMAGE}"
          )
          # save container name for later step
          echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV"
          # jenkins user does not have write permission to mounted workspace; work-around by copying within container to jenkins home
          docker exec -t "${container_name}" sh -c "cd .. && cp -R workspace pytorch && cd pytorch && pip install dist/*.whl && ${TEST_COMMAND}"

      - name: Save test results
        if: always()
        run: |
          # copy test results back to the mounted workspace, needed sudo, resulting permissions were correct
          docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "cd ../pytorch && sudo cp -R test/test-reports ../workspace/test"

      - name: Get workflow job id
        id: get-job-id
        uses: ./.github/actions/get-workflow-job-id
        if: always()
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}

      - name: Stop monitoring script
        if: always() && steps.monitor-script.outputs.monitor-script-pid
        shell: bash
        continue-on-error: true
        env:
          MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }}
        run: |
          kill "$MONITOR_SCRIPT_PID"

      - name: Upload test artifacts
        uses: ./.github/actions/upload-test-artifacts
        if: always() && (steps.test.conclusion == 'success' || steps.test.conclusion == 'failure')
        with:
          use-gha: true
          file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}

      - name: Upload test statistics
        if: always()
        env:
          AWS_DEFAULT_REGION: us-east-1
          GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
          BRANCH: ${{ steps.parse-ref.outputs.branch }}
          TEST_CONFIG: ${{ matrix.config }}
          SHARD_NUMBER: ${{ matrix.shard }}
          BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          PYTORCH_RETRY_TEST_CASES: 1
          PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
          TAG: ${{ steps.parse-ref.outputs.tag }}
          WORKFLOW_ID: ${{ github.run_id }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }}
          GHA_WORKFLOW_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
        shell: bash
        run: |
          set -x
          python3 -m pip install -r requirements.txt
          python3 -m pip install boto3==1.19.12
          python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test

      - name: Teardown ROCm
        if: always()
        shell: bash
        run: |
          # Only stop the docker container we started since there might be multiple runners on this host.
          docker stop "${{ env.CONTAINER_NAME }}" || true
          # Prune all of the docker containers.
          # Might fail if a prune is already in progress by another runner.
          docker container prune -f || true
          # Prune everything docker if there are more than 10 images (~200GB).
          # This is easier than using a time filter, e.g., "until=24h".
          # Might fail if a prune is already in progress by another runner.
          image_count=$(docker images | wc -l)
          if [[ ${image_count} -gt 10 ]]; then
            echo "Purging all docker caches"
            docker system prune -af || true
          else
            echo "Will not purge docker, only ${image_count} images found"
          fi