mirror of
https://github.com/saymrwulf/pytorch.git
synced 2026-05-15 21:00:47 +00:00
I'm trying to make these benchmark results available on the OSS benchmark database, so that people can query them from outside. The first step is to also record the results in a JSON format compatible with the database schema defined in https://github.com/pytorch/test-infra/pull/5839. Existing CSV files remain unchanged. ### Testing The JSON results are uploaded as artifacts to S3 (see https://github.com/pytorch/pytorch/actions/runs/11809725848/job/32901411180#step:26:13), for example https://gha-artifacts.s3.amazonaws.com/pytorch/pytorch/11809725848/1/artifact/test-jsons-test-pr_time_benchmarks-1-1-linux.g4dn.metal.nvidia.gpu_32901411180.zip Pull Request resolved: https://github.com/pytorch/pytorch/pull/140493 Approved by: https://github.com/laithsakka
458 lines
20 KiB
YAML
458 lines
20 KiB
YAML
# Reusable workflow (triggered only via `workflow_call`) that runs one test job
# per entry of the caller-supplied test matrix inside the given docker image.
name: linux-test

on:
  workflow_call:
    inputs:
      build-environment:
        required: true
        type: string
        description: Top-level label for what's being built/tested.
      test-matrix:
        required: true
        type: string
        description: JSON description of what test configs to run.
      docker-image:
        required: true
        type: string
        description: Docker image to run in.
      sync-tag:
        required: false
        type: string
        default: ""
        description: |
          If this is set, our linter will use this to make sure that every other
          job with the same `sync-tag` is identical.
      timeout-minutes:
        required: false
        type: number
        default: 240
        description: |
          Maximum time (in minutes) the test job is allowed to run.
          Configs with mem_leak_check enabled use a fixed 600 minutes instead.
      use-gha:
        required: false
        type: string
        default: ""
        description: If set to any value, upload to GHA. Otherwise upload to S3.
      dashboard-tag:
        required: false
        type: string
        default: ""
        description: Forwarded to the test container as the DASHBOARD_TAG environment variable.
      s3-bucket:
        description: S3 bucket to download build artifacts from and upload test artifacts to
        required: false
        type: string
        default: "gha-artifacts"
      aws-role-to-assume:
        description: role to assume for downloading artifacts
        required: false
        type: string
        default: ""
      disable-monitor:
        description: |
          [Experimental] Disable utilization monitoring for tests.
          Currently, by default we disable the monitor job and only look for specific tests,
          since we are investigating the behaviour of the monitor script with different tests.
        required: false
        type: boolean
        default: true
    secrets:
      HUGGING_FACE_HUB_TOKEN:
        required: false
        description: |
          HF Auth token to avoid rate limits when downloading models or datasets from hub
      SCRIBE_GRAPHQL_ACCESS_TOKEN:
        required: false
        description: |
          FB app token to write to scribe endpoint

# Available to every step of every job in this workflow.
env:
  GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
|
|
|
|
jobs:
  test:
    # Skipped on forks (owner is not 'pytorch') and when the matrix has no entries.
    if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]'
    # The runner label comes from each matrix entry.
    runs-on: ${{ matrix.runner }}
    # mem_leak_check configs get a fixed 600 minutes; all others use the caller's value.
    timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }}
    # 'scribe-protected' on main and release/* branches, 'scribe-pr' on pull
    # requests labelled 'ci-scribe', otherwise no deployment environment.
    environment: ${{ github.ref == 'refs/heads/main' && 'scribe-protected' || startsWith(github.ref, 'refs/heads/release/') && 'scribe-protected' || contains(github.event.pull_request.labels.*.name, 'ci-scribe') && 'scribe-pr' || '' }}
    strategy:
      # One failing config must not cancel the remaining matrix entries.
      fail-fast: false
      matrix: ${{ fromJSON(inputs.test-matrix) }}
    steps:
|
|
# SSH access for debugging; not set up on runners whose label contains 'gcp.a100'.
- name: Setup SSH (Click me for login details)
  if: ${{ !contains(matrix.runner, 'gcp.a100') }}
  uses: pytorch/test-infra/.github/actions/setup-ssh@main
  with:
    github-secret: ${{ secrets.GITHUB_TOKEN }}
    instructions: |
      All testing is done inside the container, to start an interactive session run:
        docker exec -it $(docker container ps --format '{{.ID}}') bash
|
|
|
|
# Uses the shared checkout action from pytorch/pytorch@main, asking it not to use sudo.
- name: Checkout PyTorch
  uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
  with:
    no-sudo: true
|
|
|
|
# Local composite action, so it must run after the checkout step.
- name: Setup Linux
  uses: ./.github/actions/setup-linux
|
|
|
|
# Runs only when the caller supplied a role; with the default empty
# `aws-role-to-assume` input this step is skipped.
- name: configure aws credentials
  if: ${{ inputs.aws-role-to-assume != '' }}
  uses: aws-actions/configure-aws-credentials@v3
  with:
    role-to-assume: ${{ inputs.aws-role-to-assume }}
    role-session-name: gha-linux-test
    aws-region: us-east-1
|
|
|
|
# Later steps read the resolved image from this step's `docker-image` output.
- name: Calculate docker image
  id: calculate-docker-image
  uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
  with:
    docker-image-name: ${{ inputs.docker-image }}
|
|
|
|
# Informational only: prints a `docker pull` command for the ghcr.io copy of the image.
- name: Use following to pull public copy of the image
  id: print-ghcr-mirror
  shell: bash
  env:
    ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
  run: |
    # Keep only the part after the last '/', then turn the first ':' into '-'.
    tag=${ECR_DOCKER_IMAGE##*/}
    echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"
|
|
|
|
# Pulls the image resolved by the calculate-docker-image step.
- name: Pull docker image
  uses: pytorch/test-infra/.github/actions/pull-docker-image@main
  with:
    docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
|
|
|
|
# Sets the IN_CONTAINER_RUNNER output to 'true' when either marker file
# (/.inarc or /.incontainer) exists, and to 'false' otherwise.
- name: Check if in a container runner
  id: check_container_runner
  shell: bash
  run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT"
|
|
|
|
# CUDA builds only, excluding 'nogpu' configs, and only outside container runners.
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
  id: install-nvidia-driver
  if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}
  uses: pytorch/test-infra/.github/actions/setup-nvidia@main
|
|
|
|
# Container-runner counterpart of the driver install step: only exports
# GPU_FLAG for the later `docker run`.
- name: Setup GPU_FLAG for docker run
  id: setup-gpu-flag
  if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' }}
  run: echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
|
|
|
|
# On container runners, derive the sccache port from RUNNER_UID (offset 4226)
# and export it as a ready-made `-e` flag for the later `docker run`.
- name: Setup SCCACHE_SERVER_PORT environment for docker run when on container
  id: setup-sscache-port-flag
  if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' }}
  run: echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}"
|
|
|
|
# Only on runners whose label contains 'a100', and never on container runners.
- name: Lock NVIDIA A100 40GB Frequency
  if: ${{ contains(matrix.runner, 'a100') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}
  run: |
    # Enable persistence mode, set the application clocks, then print the GPU state.
    sudo nvidia-smi -pm 1
    sudo nvidia-smi -ac 1215,1410
    nvidia-smi
|
|
|
|
# Best-effort utilization monitor: a failure here must not fail the job.
- name: Start monitoring script
  id: monitor-script
  if: ${{ !inputs.disable-monitor }}
  continue-on-error: true
  shell: bash
  run: |
    python3 -m pip install psutil==5.9.1 nvidia-ml-py==11.525.84
    # Runs in the background; its PID is published so a later step can kill it.
    python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
    echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"
|
|
|
|
# Artifacts are looked up by build environment name in the caller-selected bucket.
- name: Download build artifacts
  uses: ./.github/actions/download-build-artifacts
  with:
    s3-bucket: ${{ inputs.s3-bucket }}
    name: ${{ inputs.build-environment }}
|
|
|
|
# Optional input: the job carries on even if this download fails.
- name: Download TD artifacts
  uses: ./.github/actions/download-td-artifacts
  continue-on-error: true
|
|
|
|
# The Test step reads this step's `branch` output.
- name: Parse ref
  id: parse-ref
  run: .github/scripts/parse_ref.py
|
|
|
|
# Runs even after earlier failures, since later always() steps use its
# job-id output in artifact names.
- name: Get workflow job id
  id: get-job-id
  if: always()
  uses: ./.github/actions/get-workflow-job-id
  with:
    github-token: ${{ secrets.GITHUB_TOKEN }}
|
|
|
|
# The filter-test-configs action is reused here because it conveniently
# checks for labels and re-enabled test issues. It performs no filtering
# in this job; all filtering is done in the build step.
- name: Check for keep-going label and re-enabled test issues
  id: keep-going
  uses: ./.github/actions/filter-test-configs
  with:
    job-name: ${{ steps.get-job-id.outputs.job-name }}
    test-matrix: ${{ inputs.test-matrix }}
    github-token: ${{ secrets.GITHUB_TOKEN }}
|
|
|
|
# Computes the timeout of the Test step from the job timeout and publishes it
# as the `timeout` output.
- name: Set Test step time
  id: test-timeout
  shell: bash
  env:
    JOB_TIMEOUT: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }}
  run: |
    # The Test step gets 30 minutes less than the whole job (presumably to
    # leave room for the upload/teardown steps that follow).
    STEP_TIMEOUT=$((JOB_TIMEOUT - 30))
    # A caller-supplied timeout of 30 minutes or less would otherwise produce
    # a zero or negative step timeout; fall back to the full job timeout.
    if [ "${STEP_TIMEOUT}" -le 0 ]; then
      STEP_TIMEOUT="${JOB_TIMEOUT}"
    fi
    echo "timeout=${STEP_TIMEOUT}" >> "${GITHUB_OUTPUT}"
|
|
|
|
# Starts a detached container from the CI image, installs the built wheel in
# it and runs the test script selected by TEST_CONFIG / BUILD_ENVIRONMENT.
- name: Test
  id: test
  # Computed by the test-timeout step.
  timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
  env:
    # Build / run identification
    BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
    PR_NUMBER: ${{ github.event.pull_request.number }}
    GITHUB_REPOSITORY: ${{ github.repository }}
    GITHUB_WORKFLOW: ${{ github.workflow }}
    GITHUB_JOB: ${{ github.job }}
    GITHUB_RUN_ID: ${{ github.run_id }}
    GITHUB_RUN_NUMBER: ${{ github.run_number }}
    GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
    JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
    JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
    BRANCH: ${{ steps.parse-ref.outputs.branch }}
    SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
    BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }}
    # Matrix entry being executed
    TEST_CONFIG: ${{ matrix.config }}
    SHARD_NUMBER: ${{ matrix.shard }}
    NUM_TEST_SHARDS: ${{ matrix.num_shards }}
    # Switches reported by the keep-going step
    REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
    CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
    VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
    TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }}
    NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }}
    NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
    TD_DISTRIBUTED: ${{ steps.keep-going.outputs.ci-td-distributed }}
    # Compiler cache
    SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
    SCCACHE_REGION: us-east-1
    SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
    # Container settings
    SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
    DOCKER_IMAGE: ${{ inputs.docker-image }}
    XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
    XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
    PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
    PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
    DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
    HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
    SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
    IS_A100_RUNNER: ${{ contains(matrix.runner, 'a100') && '1' || '0' }}
    # Same pattern as the file-suffix passed to the upload-test-artifacts step.
    ARTIFACTS_FILE_SUFFIX: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
  run: |
    set -x

    # Pick the test entry point: multigpu configs first, then ONNX builds,
    # otherwise the default PyTorch test script.
    if [[ $TEST_CONFIG == 'multigpu' ]]; then
      TEST_COMMAND=.ci/pytorch/multigpu-test.sh
    elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
      TEST_COMMAND=.ci/onnx/test.sh
    else
      TEST_COMMAND=.ci/pytorch/test.sh
    fi

    # detached container should get cleaned up by teardown_ec2_linux
    # TODO: Stop building test binaries as part of the build phase
    # Used for GPU_FLAG since that doesn't play nice
    #
    # NOTE: no --name flag is passed. The previous --name="${container_name}"
    # referenced the very variable this command substitution assigns, so it
    # always expanded to an empty name. `docker run --detach` prints the
    # container ID, which is what ends up in container_name.
    # shellcheck disable=SC2086,SC2090
    container_name=$(docker run \
      ${GPU_FLAG:-} \
      ${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \
      -e BUILD_ENVIRONMENT \
      -e PR_NUMBER \
      -e GITHUB_ACTIONS \
      -e GITHUB_REPOSITORY \
      -e GITHUB_WORKFLOW \
      -e GITHUB_JOB \
      -e GITHUB_RUN_ID \
      -e GITHUB_RUN_NUMBER \
      -e GITHUB_RUN_ATTEMPT \
      -e JOB_ID \
      -e JOB_NAME \
      -e BASE_SHA \
      -e BRANCH \
      -e SHA1 \
      -e AWS_DEFAULT_REGION \
      -e IN_WHEEL_TEST \
      -e SHARD_NUMBER \
      -e TEST_CONFIG \
      -e NUM_TEST_SHARDS \
      -e REENABLED_ISSUES \
      -e CONTINUE_THROUGH_ERROR \
      -e VERBOSE_TEST_LOGS \
      -e TEST_SHOWLOCALS \
      -e NO_TEST_TIMEOUT \
      -e NO_TD \
      -e TD_DISTRIBUTED \
      -e PR_LABELS \
      -e MAX_JOBS="$(nproc --ignore=2)" \
      -e SCCACHE_BUCKET \
      -e SCCACHE_REGION \
      -e SCCACHE_S3_KEY_PREFIX \
      -e XLA_CUDA \
      -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
      -e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
      -e PYTORCH_TEST_RERUN_DISABLED_TESTS \
      -e SKIP_SCCACHE_INITIALIZATION=1 \
      -e HUGGING_FACE_HUB_TOKEN \
      -e SCRIBE_GRAPHQL_ACCESS_TOKEN \
      -e DASHBOARD_TAG \
      -e IS_A100_RUNNER \
      -e ARTIFACTS_FILE_SUFFIX \
      --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
      --security-opt seccomp=unconfined \
      --cap-add=SYS_PTRACE \
      --ipc=host \
      --shm-size="${SHM_SIZE}" \
      --tty \
      --detach \
      --user jenkins \
      -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
      -w /var/lib/jenkins/workspace \
      "${DOCKER_IMAGE}"
    )
    # Propagate download.pytorch.org IP to container
    grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" sudo bash -c "/bin/cat >> /etc/hosts"
    # Exported so later steps (e.g. coredump collection) can `docker exec` into it.
    echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}"
    # Install the built wheel with the opt-einsum extra, then run the tests.
    docker exec -t "${container_name}" sh -c "python3 -m pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}"
|
|
|
|
# Only when the job is failing and the Test step itself is what failed.
# Best effort: an upload error does not add another failure.
- name: Upload pytest cache if tests failed
  if: failure() && steps.test.conclusion && steps.test.conclusion == 'failure'
  continue-on-error: true
  uses: ./.github/actions/pytest-cache-upload
  with:
    job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}
    test_config: ${{ matrix.config }}
    shard: ${{ matrix.shard }}
    sha: ${{ github.event.pull_request.head.sha || github.sha }}
    cache_dir: .pytest_cache
|
|
|
|
# No `if:` is given, so this runs only when all previous steps succeeded.
- name: Upload the benchmark results
  uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
  with:
    github-token: ${{ secrets.GITHUB_TOKEN }}
    schema-version: v3
    benchmark-results-dir: test/test-reports
    dry-run: false
|
|
|
|
# Runs whenever the Test step has a conclusion, whatever the job status is.
- name: Print remaining test logs
  if: always() && steps.test.conclusion
  shell: bash
  run: |
    # Bash treats `**` like a single `*` unless globstar is enabled; turn it
    # on so logs at any depth below test/ are printed.
    shopt -s globstar
    # No matching log is not an error.
    cat test/**/*_toprint.log || true
|
|
|
|
# Runs only if the monitor step published a PID. Best effort.
- name: Stop monitoring script
  if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }}
  continue-on-error: true
  shell: bash
  env:
    MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }}
  run: |
    kill "$MONITOR_SCRIPT_PID"
|
|
|
|
# Uploads whenever the Test step actually ran (its conclusion is set and is
# not 'skipped'), regardless of the job status.
- name: Upload test artifacts
  if: always() && steps.test.conclusion && steps.test.conclusion != 'skipped'
  uses: ./.github/actions/upload-test-artifacts
  with:
    s3-bucket: ${{ inputs.s3-bucket }}
    use-gha: ${{ inputs.use-gha }}
    file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
|
|
|
|
# For every core.<N> file found in the workspace, print a gdb backtrace by
# running gdb inside the test container (DOCKER_CONTAINER_ID is exported by
# the Test step).
- name: Collect backtraces from coredumps (if any)
  if: always()
  run: |
    # shellcheck disable=SC2156
    find . -iname "core.[1-9]*" -exec docker exec "${DOCKER_CONTAINER_ID}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \;
|
|
|
|
# Failing jobs only; finding no core file is not an error.
- name: Store Core dumps on S3
  if: failure()
  uses: seemethere/upload-artifact-s3@v5
  with:
    name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}
    path: ./**/core.[1-9]*
    if-no-files-found: ignore
    retention-days: 14
|
|
|
|
# Always attempted, except on container runners.
- name: Teardown Linux
  if: always() && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false'
  uses: pytorch/test-infra/.github/actions/teardown-linux@main
|
|
|
|
# NB: G5 runners with A10G GPUs intermittently hit a GPU-related issue that
# resetting the GPU (as setup-nvidia does) does not seem to fix. Symptoms:
#   * nvidia-smi times out after 60 seconds
#   * nvidia-smi fails with "unable to determine the device handle for GPU",
#     unknown error
#   * tests fail with a missing CUDA GPU error when initializing CUDA in PyTorch
#   * `docker --gpus all` fails with an error response from the daemon
#
# Since neither the root cause nor a recovery path is known, the runner is
# taken out of service so that it doesn't get any more jobs.
- name: Check NVIDIA driver installation step
  # Only for failing jobs in which the driver installation step was not skipped.
  if: failure() && steps.install-nvidia-driver.outcome && steps.install-nvidia-driver.outcome != 'skipped'
  shell: bash
  env:
    RUNNER_WORKSPACE: ${{ runner.workspace }}
  run: |
    # Keep going after failing commands: exit codes are inspected by hand.
    set +e
    set -x

    # Exit codes 0 and 14 are the acceptable return codes from nvidia-smi (as
    # copied from the setup-nvidia GitHub action); anything else stops the runner.
    stop_runner_on_bad_status() {
      if [ "$1" -ne 0 ] && [ "$1" -ne 14 ]; then
        echo "NVIDIA driver installation has failed, shutting down the runner..."
        .github/scripts/stop_runner_service.sh
      fi
    }

    nvidia-smi
    # NB: Surprisingly, a plain nvidia-smi returns 0 even when the driver has
    # already crashed, because it can still read the driver version and some
    # basic information like the bus ID. The rest of the information is then
    # missing (ERR!), for example:
    #
    # +-----------------------------------------------------------------------------+
    # | NVIDIA-SMI 525.89.02    Driver Version: 525.89.02    CUDA Version: 12.0     |
    # |-------------------------------+----------------------+----------------------+
    # | GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
    # | Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
    # |                               |                      |               MIG M. |
    # |===============================+======================+======================|
    # |   0  ERR!                Off  | 00000000:00:1E.0 Off |                 ERR! |
    # |ERR!  ERR! ERR!    ERR! / ERR! |   4184MiB / 23028MiB |    ERR!      Default |
    # |                               |                      |                 ERR! |
    # +-------------------------------+----------------------+----------------------+
    #
    # +-----------------------------------------------------------------------------+
    # | Processes:                                                                  |
    # |  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
    # |        ID   ID                                                   Usage      |
    # |=============================================================================|
    # +-----------------------------------------------------------------------------+
    #
    # This should be reported as a failure instead, as Docker is guaranteed to
    # fail when it tries to run with --gpus all.
    #
    # So the correct check is to query one of the missing pieces of info, like
    # the GPU name, so that the command can fail accordingly.
    nvidia-smi --query-gpu=gpu_name --format=csv,noheader --id=0
    NVIDIA_SMI_STATUS=$?
    stop_runner_on_bad_status "$NVIDIA_SMI_STATUS"

    # For runners with multiple GPUs, also confirm that the number of GPUs is a
    # power of 2, i.e. 1, 2, 4, or 8. This avoids flaky tests when one GPU fails
    # https://github.com/pytorch/test-infra/issues/4000
    GPU_COUNT=$(nvidia-smi --list-gpus | wc -l)
    NVIDIA_SMI_STATUS=$?
    stop_runner_on_bad_status "$NVIDIA_SMI_STATUS"

    # Check the GPU count to be a power of 2
    if [ "$GPU_COUNT" -le 8 ] && [ "$GPU_COUNT" -ne 1 ] && [ "$GPU_COUNT" -ne 2 ] && [ "$GPU_COUNT" -ne 4 ] && [ "$GPU_COUNT" -ne 8 ]; then
      echo "NVIDIA driver detects $GPU_COUNT GPUs. The runner has a broken GPU, shutting it down..."
      .github/scripts/stop_runner_service.sh
    fi
|