mirror of
https://github.com/saymrwulf/pytorch.git
synced 2026-05-14 20:57:59 +00:00
Sometimes job is cancelled during nested docker container creation. This leads to nested docker container not being stopped and worker hanging forever in the job. Improve nested docker containers cleanup for these cases. Pull Request resolved: https://github.com/pytorch/pytorch/pull/144149 Approved by: https://github.com/seemethere
342 lines
14 KiB
YAML
342 lines
14 KiB
YAML
# Reusable workflow that builds PyTorch on Linux inside a docker container.
# It is only invoked from other workflows via `workflow_call`.
name: linux-build

on:
  workflow_call:
    # Inputs supplied by the calling workflow.
    inputs:
      build-environment:
        required: true
        type: string
        description: Top-level label for what's being built/tested.
      docker-image-name:
        required: true
        type: string
        description: Name of the base docker image to build with.
      build-generates-artifacts:
        required: false
        type: boolean
        default: true
        description: If set, upload generated build artifacts.
      build-with-debug:
        required: false
        type: boolean
        default: false
        description: If set, build in debug mode.
      sync-tag:
        required: false
        type: string
        default: ""
        description: |
          If this is set, our linter will use this to make sure that every other
          job with the same `sync-tag` is identical.
      cuda-arch-list:
        required: false
        type: string
        # Quoted so the value stays the string "5.2" instead of a float.
        default: "5.2"
        description: |
          List of CUDA architectures CI build should target.
      runner_prefix:
        required: false
        default: ""
        type: string
        description: Prefix for runner label
      runner:
        required: false
        type: string
        default: "linux.2xlarge"
        description: |
          Label of the runner this job should run on.
      test-matrix:
        required: false
        type: string
        description: |
          An optional JSON description of what test configs to run later on. This
          is moved here from the Linux test workflow so that we can apply filter
          logic using test-config labels earlier and skip unnecessary builds.
      selected-test-configs:
        description: |
          A comma-separated list of test configurations from the test matrix to keep.
          The empty list means we are going to keep every configuration by default.
        required: false
        type: string
        default: ""
      s3-bucket:
        description: S3 bucket to download artifact
        required: false
        type: string
        default: "gha-artifacts"
      aws-role-to-assume:
        description: Role to assume for downloading artifacts
        required: false
        type: string
        default: ""
      use_split_build:
        description: |
          [Experimental] Build a libtorch-only wheel and build PyTorch such that
          it is built from the libtorch wheel.
        required: false
        type: boolean
        default: false

    # Optional secrets; both are passed through to the build container's
    # environment by the Build step.
    secrets:
      HUGGING_FACE_HUB_TOKEN:
        required: false
        description: |
          HF Auth token to avoid rate limits when downloading models or datasets from hub
      SCRIBE_GRAPHQL_ACCESS_TOKEN:
        required: false
        description: |
          FB app token to write to scribe endpoint

    # Values handed back to the calling workflow, taken from the `build` job.
    outputs:
      docker-image:
        value: ${{ jobs.build.outputs.docker-image }}
        description: The docker image containing the built PyTorch.
      test-matrix:
        value: ${{ jobs.build.outputs.test-matrix }}
        description: An optional JSON description of what test configs to run later on.

jobs:
  # Single job: prepares the runner, starts a detached build container, runs
  # .ci/pytorch/build.sh inside it and uploads the resulting artifacts.
  # Steps guarded by `!= 'linux-s390x-binary-manywheel'` are skipped for that
  # build environment, which has its own artifact upload and cleanup steps.
  build:
    # Don't run on forked repos
    if: github.repository_owner == 'pytorch'
    runs-on: ${{ inputs.runner_prefix }}${{ inputs.runner }}
    timeout-minutes: 240
    # 'scribe-protected' on main and release branches, 'scribe-pr' for pull
    # requests labelled 'ci-scribe', otherwise no environment.
    environment: ${{ github.ref == 'refs/heads/main' && 'scribe-protected' || startsWith(github.ref, 'refs/heads/release/') && 'scribe-protected' || contains(github.event.pull_request.labels.*.name, 'ci-scribe') && 'scribe-pr' || '' }}
    outputs:
      docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
      test-matrix: ${{ steps.filter.outputs.test-matrix }}
    steps:
      - name: Setup SSH (Click me for login details)
        if: inputs.build-environment != 'linux-s390x-binary-manywheel'
        uses: pytorch/test-infra/.github/actions/setup-ssh@main
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}

      # [pytorch repo ref]
      # This action is referenced through pytorch/pytorch rather than through
      # the local checkout, because no local checkout exists yet when it
      # runs. Everywhere else a local checkout reference is preferable.
      - name: Checkout PyTorch
        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
        with:
          no-sudo: true

      - name: Setup Linux
        if: inputs.build-environment != 'linux-s390x-binary-manywheel'
        uses: ./.github/actions/setup-linux

      # Only runs when the caller supplied a role to assume.
      - name: configure aws credentials
        if: inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel'
        uses: aws-actions/configure-aws-credentials@v3
        with:
          role-to-assume: ${{ inputs.aws-role-to-assume }}
          role-session-name: gha-linux-build
          aws-region: us-east-1

      - name: Calculate docker image
        id: calculate-docker-image
        if: inputs.build-environment != 'linux-s390x-binary-manywheel'
        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
        with:
          docker-image-name: ${{ inputs.docker-image-name }}

      # Prints a ready-to-copy `docker pull` command for the ghcr.io copy of
      # the image: the last path component of the image name, with the first
      # ':' replaced by '-'.
      - name: Use following to pull public copy of the image
        id: print-ghcr-mirror
        if: inputs.build-environment != 'linux-s390x-binary-manywheel'
        shell: bash
        env:
          ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
        run: |
          tag=${ECR_DOCKER_IMAGE##*/}
          echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"

      - name: Pull docker image
        if: inputs.build-environment != 'linux-s390x-binary-manywheel'
        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
        with:
          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

      - name: Parse ref
        id: parse-ref
        run: .github/scripts/parse_ref.py

      - name: Get workflow job id
        id: get-job-id
        if: always()
        uses: ./.github/actions/get-workflow-job-id
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}

      # The test-config filter is applied at build time as well, so a build
      # whose filtered test matrix is empty can be skipped (see the `if` on
      # the Build step).
      - name: Select all requested test configurations (if the test matrix is available)
        id: filter
        uses: ./.github/actions/filter-test-configs
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          test-matrix: ${{ inputs.test-matrix }}
          selected-test-configs: ${{ inputs.selected-test-configs }}
          job-name: ${{ steps.get-job-id.outputs.job-name }}

      # Best effort: a failed cache download does not fail the job.
      - name: Download pytest cache
        if: inputs.build-environment != 'linux-s390x-binary-manywheel'
        continue-on-error: true
        uses: ./.github/actions/pytest-cache-download
        with:
          cache_dir: .pytest_cache
          job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}
          s3_bucket: ${{ inputs.s3-bucket }}

      # Runs when no test matrix was given, or when the filtered matrix still
      # contains at least one configuration.
      - name: Build
        id: build
        if: steps.filter.outputs.is-test-matrix-empty == 'False' || inputs.test-matrix == ''
        env:
          BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
          BRANCH: ${{ steps.parse-ref.outputs.branch }}
          # TODO duplicated
          AWS_DEFAULT_REGION: us-east-1
          PR_NUMBER: ${{ github.event.pull_request.number }}
          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
          SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
          SCCACHE_REGION: us-east-1
          # Keyed on the build environment so every build job for the same
          # environment shares one cache no matter which workflow started it.
          # This should improve cache usage for jobs in non-pull workflows
          # such as periodic, slow, or inductor.
          SCCACHE_S3_KEY_PREFIX: ${{ inputs.build-environment || github.workflow }}
          XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
          PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
          TORCH_CUDA_ARCH_LIST: ${{ inputs.cuda-arch-list }}
          DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
          DOCKER_IMAGE_S390X: ${{ inputs.docker-image-name }}
          XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
          # NOTE(review): DEBUG is not in the `-e` list given to `docker run`
          # below, so this step does not forward it into the container.
          # Confirm whether `build-with-debug` is meant to take effect here.
          DEBUG: ${{ inputs.build-with-debug && '1' || '0' }}
          OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
          SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
          USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
        run: |
          START_TIME=$(date +%s)
          if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then
            JENKINS_USER=
            USED_IMAGE="${DOCKER_IMAGE_S390X}"
            # Safety net: the container exits by itself after 12 hours in
            # case the cleanup step fails to stop it when the job is
            # cancelled.
            DOCKER_SHELL_CMD="sleep 12h"

            # Some setup steps are skipped on s390x, so the env file passed
            # to `docker run` below is populated here instead.
            env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
            env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"
          else
            JENKINS_USER="--user jenkins"
            USED_IMAGE="${DOCKER_IMAGE}"
            DOCKER_SHELL_CMD=
          fi

          # Total memory in GB minus 1GB, which is left for the runner and
          # other things.
          TOTAL_AVAILABLE_MEMORY_IN_GB=$(awk '/MemTotal/ { printf "%.3f \n", $2/1024/1024 - 1 }' /proc/meminfo)
          # See https://docs.docker.com/engine/containers/resource_constraints/#--memory-swap-details
          # The extra 3GB of swap comes from https://github.com/pytorch/test-infra/pull/6058
          TOTAL_MEMORY_WITH_SWAP=$(("${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}" + 3))

          # The detached container is expected to be cleaned up by
          # teardown_ec2_linux.
          # JENKINS_USER and DOCKER_SHELL_CMD are left unquoted on purpose,
          # because either of them can be empty.
          # shellcheck disable=SC2086
          container_name=$(docker run \
            -e BUILD_ENVIRONMENT \
            -e MAX_JOBS="$(nproc --ignore=2)" \
            -e AWS_DEFAULT_REGION \
            -e PR_NUMBER \
            -e SHA1 \
            -e BRANCH \
            -e SCCACHE_BUCKET \
            -e SCCACHE_REGION \
            -e SCCACHE_S3_KEY_PREFIX \
            -e XLA_CUDA \
            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
            -e SKIP_SCCACHE_INITIALIZATION=1 \
            -e TORCH_CUDA_ARCH_LIST \
            -e PR_LABELS \
            -e OUR_GITHUB_JOB_ID \
            -e HUGGING_FACE_HUB_TOKEN \
            -e SCRIBE_GRAPHQL_ACCESS_TOKEN \
            -e USE_SPLIT_BUILD \
            --memory="${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}g" \
            --memory-swap="${TOTAL_MEMORY_WITH_SWAP}g" \
            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
            --tty \
            --detach \
            ${JENKINS_USER} \
            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
            -w /var/lib/jenkins/workspace \
            "${USED_IMAGE}" \
            ${DOCKER_SHELL_CMD}
          )
          docker exec -t "${container_name}" sh -c '.ci/pytorch/build.sh'

          END_TIME=$(date +%s)
          echo "build_time=$((END_TIME - START_TIME))" >> "$GITHUB_OUTPUT"

      - name: Archive artifacts into zip
        if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped'
        run: |
          zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .additional_ci_files

      # Exactly one of the four upload steps below applies to a given run:
      # S3 or GitHub artifact storage (s390x), each for regular or split build.
      - name: Store PyTorch Build Artifacts on S3
        if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && !inputs.use_split_build && inputs.build-environment != 'linux-s390x-binary-manywheel'
        uses: seemethere/upload-artifact-s3@v5
        with:
          name: ${{ inputs.build-environment }}
          retention-days: 14
          if-no-files-found: error
          path: artifacts.zip
          s3-bucket: ${{ inputs.s3-bucket }}

      - name: Store PyTorch Build Artifacts on S3 for split build
        if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && inputs.use_split_build && inputs.build-environment != 'linux-s390x-binary-manywheel'
        uses: seemethere/upload-artifact-s3@v5
        with:
          name: ${{ inputs.build-environment }}-experimental-split-build
          retention-days: 14
          if-no-files-found: error
          path: artifacts.zip
          s3-bucket: ${{ inputs.s3-bucket }}

      - name: Store PyTorch Build Artifacts for s390x
        if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && !inputs.use_split_build && inputs.build-environment == 'linux-s390x-binary-manywheel'
        uses: actions/upload-artifact@v4
        with:
          name: ${{ inputs.build-environment }}
          retention-days: 14
          if-no-files-found: error
          path: artifacts.zip

      - name: Store PyTorch Build Artifacts for s390x for split build
        if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && inputs.use_split_build && inputs.build-environment == 'linux-s390x-binary-manywheel'
        uses: actions/upload-artifact@v4
        with:
          name: ${{ inputs.build-environment }}-experimental-split-build
          retention-days: 14
          if-no-files-found: error
          path: artifacts.zip

      - name: Upload sccache stats
        if: steps.build.outcome != 'skipped' && inputs.build-environment != 'linux-s390x-binary-manywheel'
        uses: ./.github/actions/upload-sccache-stats
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          build-time: ${{ steps.build.outputs.build_time }}

      # `always()` keeps both cleanup steps running after a failure or a
      # cancellation.
      - name: Teardown Linux
        if: always() && inputs.build-environment != 'linux-s390x-binary-manywheel'
        uses: pytorch/test-infra/.github/actions/teardown-linux@main

      - name: Cleanup docker
        if: always() && inputs.build-environment == 'linux-s390x-binary-manywheel'
        shell: bash
        run: |
          # On s390x, stop the container so the worker can stop cleanly.
          # `|| true` keeps this step from failing on a cleanup error.
          # NOTE(review): `-a` is not a documented option of Docker's own
          # `stop`/`kill` commands. Presumably `docker` on the s390x runners
          # is a Podman shim, where `-a` selects all containers - confirm.
          docker stop -a || true
          docker kill -a || true