pytorch/.github/workflows/_linux-test.yml
Xuehai Pan 4b0e2e2cc6 Use official NVML Python bindings (#93925)
Use the official NVML Python binding package [`nvidia-ml-py`](https://pypi.org/project/nvidia-ml-py), which is maintained by the NVIDIA NVML team.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/93925
Approved by: https://github.com/huydhn, https://github.com/ZainRizvi, https://github.com/ptrblck
2023-02-07 05:27:36 +00:00

265 lines
9.9 KiB
YAML

name: linux-test
on:
workflow_call:
inputs:
build-environment:
required: true
type: string
description: Top-level label for what's being built/tested.
test-matrix:
required: true
type: string
description: JSON description of what test configs to run.
docker-image:
required: true
type: string
description: Docker image to run in.
sync-tag:
required: false
type: string
default: ""
description: |
If this is set, our linter will use this to make sure that every other
job with the same `sync-tag` is identical.
timeout-minutes:
required: false
type: number
default: 240
description: |
Set the maximum (in minutes) how long the workflow should take to finish
use-gha:
required: false
type: string
default: ""
description: If set to any value, upload to GHA. Otherwise upload to S3.
env:
GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
jobs:
# This needs to be run right before the test starts so that it can gather the
# latest labels from the PR
filter:
runs-on: [self-hosted, linux.large]
outputs:
test-matrix: ${{ steps.filter.outputs.test-matrix }}
is-test-matrix-empty: ${{ steps.filter.outputs.is-test-matrix-empty }}
keep-going: ${{ steps.filter.outputs.keep-going }}
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
with:
fetch-depth: 1
submodules: false
- name: Select all requested test configurations
id: filter
uses: ./.github/actions/filter-test-configs
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
test-matrix: ${{ inputs.test-matrix }}
test:
needs: filter
# Don't run on forked repos or empty test matrix
if: github.repository_owner == 'pytorch' && needs.filter.outputs.is-test-matrix-empty == 'False'
strategy:
matrix: ${{ fromJSON(needs.filter.outputs.test-matrix) }}
fail-fast: false
runs-on: ${{ matrix.runner }}
timeout-minutes: ${{ inputs.timeout-minutes }}
steps:
- name: Setup SSH (Click me for login details)
uses: pytorch/test-infra/.github/actions/setup-ssh@main
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
instructions: |
All testing is done inside the container, to start an interactive session run:
docker exec -it $(docker container ps --format '{{.ID}}') bash
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
- name: Setup Linux
uses: ./.github/actions/setup-linux
- name: Pull docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: ${{ inputs.docker-image }}
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
uses: pytorch/test-infra/.github/actions/setup-nvidia@main
if: contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu')
- name: Start monitoring script
id: monitor-script
shell: bash
continue-on-error: true
run: |
python3 -m pip install psutil==5.9.1 nvidia-ml-py==11.525.84
python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"
- name: Download build artifacts
uses: ./.github/actions/download-build-artifacts
with:
name: ${{ inputs.build-environment }}
- name: Parse ref
id: parse-ref
run: .github/scripts/parse_ref.py
- name: Set Test step time
id: test-timeout
shell: bash
env:
JOB_TIMEOUT: ${{ inputs.timeout-minutes }}
run: |
echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}"
- name: Test
id: test
timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
env:
BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
PR_NUMBER: ${{ github.event.pull_request.number }}
BRANCH: ${{ steps.parse-ref.outputs.branch }}
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }}
PYTORCH_RETRY_TEST_CASES: 1
PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
TEST_CONFIG: ${{ matrix.config }}
SHARD_NUMBER: ${{ matrix.shard }}
NUM_TEST_SHARDS: ${{ matrix.num_shards }}
PR_BODY: ${{ github.event.pull_request.body }}
CONTINUE_THROUGH_ERROR: ${{ needs.filter.outputs.keep-going }}
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
DOCKER_IMAGE: ${{ inputs.docker-image }}
XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
run: |
set -x
if [[ $TEST_CONFIG == 'multigpu' ]]; then
TEST_COMMAND=.ci/pytorch/multigpu-test.sh
elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
TEST_COMMAND=.ci/onnx/test.sh
else
TEST_COMMAND=.ci/pytorch/test.sh
fi
COMMIT_MESSAGES=$(git cherry -v "origin/${GIT_DEFAULT_BRANCH:-master}")
# sanitize the input commit message and PR body here:
#
# trim all new lines from commit messages + PR_BODY to avoid issues with batch environment
# variable copying. see https://github.com/pytorch/pytorch/pull/80043#issuecomment-1167796028
COMMIT_MESSAGES="${COMMIT_MESSAGES//[$'\n\r']}"
PR_BODY="${PR_BODY//[$'\n\r']}"
# then trim all special characters like single and double quotes to avoid unescaped inputs to
# wreak havoc internally
export COMMIT_MESSAGES="${COMMIT_MESSAGES//[\'\"]}"
export PR_BODY="${PR_BODY//[\'\"]}"
# detached container should get cleaned up by teardown_ec2_linux
# TODO: Stop building test binaries as part of the build phase
# Used for GPU_FLAG since that doesn't play nice
# shellcheck disable=SC2086,SC2090
container_name=$(docker run \
${GPU_FLAG:-} \
-e BUILD_ENVIRONMENT \
-e PR_NUMBER \
-e GITHUB_ACTIONS \
-e BASE_SHA \
-e BRANCH \
-e SHA1 \
-e AWS_DEFAULT_REGION \
-e IN_WHEEL_TEST \
-e SHARD_NUMBER \
-e TEST_CONFIG \
-e NUM_TEST_SHARDS \
-e PR_BODY \
-e COMMIT_MESSAGES \
-e CONTINUE_THROUGH_ERROR \
-e PYTORCH_RETRY_TEST_CASES \
-e PYTORCH_OVERRIDE_FLAKY_SIGNAL \
-e PR_LABELS \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e SCCACHE_BUCKET \
-e SCCACHE_S3_KEY_PREFIX \
-e XLA_CUDA \
-e XLA_CLANG_CACHE_S3_BUCKET_NAME \
-e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
-e PYTORCH_TEST_RERUN_DISABLED_TESTS \
-e SKIP_SCCACHE_INITIALIZATION=1 \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--ulimit stack=10485760:83886080 \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--ipc=host \
--shm-size="${SHM_SIZE}" \
--tty \
--detach \
--name="${container_name}" \
--user jenkins \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}"
)
echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}"
docker exec -t "${container_name}" sh -c "pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}"
- name: Print remaining test logs
shell: bash
if: always()
run: |
cat test/**/*.log || true
- name: Get workflow job id
id: get-job-id
uses: ./.github/actions/get-workflow-job-id
if: always()
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
- name: Stop monitoring script
if: always() && steps.monitor-script.outputs.monitor-script-pid
shell: bash
continue-on-error: true
env:
MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }}
run: |
kill "$MONITOR_SCRIPT_PID"
- name: Upload test artifacts
uses: ./.github/actions/upload-test-artifacts
if: always() && steps.test.conclusion && steps.test.conclusion != 'skipped'
with:
file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
use-gha: ${{ inputs.use-gha }}
- name: Collect backtraces from coredumps (if any)
if: always()
run: |
# shellcheck disable=SC2156
find . -iname "core.[1-9]*" -exec docker exec "${DOCKER_CONTAINER_ID}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \;
- name: Store Core dumps on S3
uses: seemethere/upload-artifact-s3@v5
if: failure()
with:
name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}
retention-days: 14
if-no-files-found: ignore
path: ./**/core.[1-9]*
- name: Teardown Linux
uses: pytorch/test-infra/.github/actions/teardown-linux@main
if: always()