pytorch/.github/workflows/_linux-build.yml
Aleksei Nikiforov 127f836881 S390x cancelled jobs cleanup (#144149)
Sometimes job is cancelled during nested docker container creation.
This leads to nested docker container not being stopped and worker hanging forever in the job.
Improve nested docker containers cleanup for these cases.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/144149
Approved by: https://github.com/seemethere
2025-01-09 20:45:19 +00:00

342 lines
14 KiB
YAML

name: linux-build
on:
workflow_call:
inputs:
build-environment:
required: true
type: string
description: Top-level label for what's being built/tested.
docker-image-name:
required: true
type: string
description: Name of the base docker image to build with.
build-generates-artifacts:
required: false
type: boolean
default: true
description: If set, upload generated build artifacts.
build-with-debug:
required: false
type: boolean
default: false
description: If set, build in debug mode.
sync-tag:
required: false
type: string
default: ""
description: |
If this is set, our linter will use this to make sure that every other
job with the same `sync-tag` is identical.
cuda-arch-list:
required: false
type: string
default: "5.2"
description: |
List of CUDA architectures CI build should target.
runner_prefix:
required: false
default: ""
type: string
description: Prefix for runner label
runner:
required: false
type: string
default: "linux.2xlarge"
description: |
Label of the runner this job should run on.
test-matrix:
required: false
type: string
description: |
An option JSON description of what test configs to run later on. This
is moved here from the Linux test workflow so that we can apply filter
logic using test-config labels earlier and skip unnecessary builds
selected-test-configs:
description: |
A comma-separated list of test configurations from the test matrix to keep,
The empty list means we are going to keep every configurations by defaults
required: false
type: string
default: ""
s3-bucket:
description: S3 bucket to download artifact
required: false
type: string
default: "gha-artifacts"
aws-role-to-assume:
description: Role to assume for downloading artifacts
required: false
type: string
default: ""
use_split_build:
description: |
[Experimental] Build a libtorch only wheel and build pytorch such that
are built from the libtorch wheel.
required: false
type: boolean
default: false
secrets:
HUGGING_FACE_HUB_TOKEN:
required: false
description: |
HF Auth token to avoid rate limits when downloading models or datasets from hub
SCRIBE_GRAPHQL_ACCESS_TOKEN:
required: false
description: |
FB app token to write to scribe endpoint
outputs:
docker-image:
value: ${{ jobs.build.outputs.docker-image }}
description: The docker image containing the built PyTorch.
test-matrix:
value: ${{ jobs.build.outputs.test-matrix }}
description: An optional JSON description of what test configs to run later on.
jobs:
build:
environment: ${{ github.ref == 'refs/heads/main' && 'scribe-protected' || startsWith(github.ref, 'refs/heads/release/') && 'scribe-protected' || contains(github.event.pull_request.labels.*.name, 'ci-scribe') && 'scribe-pr' || '' }}
# Don't run on forked repos
if: github.repository_owner == 'pytorch'
runs-on: ${{ inputs.runner_prefix}}${{ inputs.runner }}
timeout-minutes: 240
outputs:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
test-matrix: ${{ steps.filter.outputs.test-matrix }}
steps:
- name: Setup SSH (Click me for login details)
uses: pytorch/test-infra/.github/actions/setup-ssh@main
if: inputs.build-environment != 'linux-s390x-binary-manywheel'
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
# [pytorch repo ref]
# Use a pytorch/pytorch reference instead of a reference to the local
# checkout because when we run this action we don't *have* a local
# checkout. In other cases you should prefer a local checkout.
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
with:
no-sudo: true
- name: Setup Linux
uses: ./.github/actions/setup-linux
if: inputs.build-environment != 'linux-s390x-binary-manywheel'
- name: configure aws credentials
uses: aws-actions/configure-aws-credentials@v3
if: ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
with:
role-to-assume: ${{ inputs.aws-role-to-assume }}
role-session-name: gha-linux-build
aws-region: us-east-1
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
if: inputs.build-environment != 'linux-s390x-binary-manywheel'
with:
docker-image-name: ${{ inputs.docker-image-name }}
- name: Use following to pull public copy of the image
id: print-ghcr-mirror
if: inputs.build-environment != 'linux-s390x-binary-manywheel'
env:
ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
shell: bash
run: |
tag=${ECR_DOCKER_IMAGE##*/}
echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"
- name: Pull docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
if: inputs.build-environment != 'linux-s390x-binary-manywheel'
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Parse ref
id: parse-ref
run: .github/scripts/parse_ref.py
- name: Get workflow job id
id: get-job-id
uses: ./.github/actions/get-workflow-job-id
if: always()
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
# Apply the filter logic to the build step too if the test-config label is already there
- name: Select all requested test configurations (if the test matrix is available)
id: filter
uses: ./.github/actions/filter-test-configs
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
test-matrix: ${{ inputs.test-matrix }}
selected-test-configs: ${{ inputs.selected-test-configs }}
job-name: ${{ steps.get-job-id.outputs.job-name }}
- name: Download pytest cache
uses: ./.github/actions/pytest-cache-download
continue-on-error: true
if: inputs.build-environment != 'linux-s390x-binary-manywheel'
with:
cache_dir: .pytest_cache
job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}
s3_bucket: ${{ inputs.s3-bucket }}
- name: Build
if: steps.filter.outputs.is-test-matrix-empty == 'False' || inputs.test-matrix == ''
id: build
env:
BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
BRANCH: ${{ steps.parse-ref.outputs.branch }}
# TODO duplicated
AWS_DEFAULT_REGION: us-east-1
PR_NUMBER: ${{ github.event.pull_request.number }}
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
SCCACHE_REGION: us-east-1
# Use the build environment here to make sure that all build jobs in the same environment
# will share the same cache regardless of which workflow they belong. This should improve
# the cache usage for jobs in non-pull workflows like periodic, slow, or inductor
SCCACHE_S3_KEY_PREFIX: ${{ inputs.build-environment || github.workflow }}
XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
TORCH_CUDA_ARCH_LIST: ${{ inputs.cuda-arch-list }}
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
DOCKER_IMAGE_S390X: ${{ inputs.docker-image-name }}
XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
DEBUG: ${{ inputs.build-with-debug && '1' || '0' }}
OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
run: |
START_TIME=$(date +%s)
if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then
JENKINS_USER=
USED_IMAGE="${DOCKER_IMAGE_S390X}"
# ensure that docker container cleanly exits in 12 hours
# if for some reason cleanup action doesn't stop container
# when job is cancelled
DOCKER_SHELL_CMD="sleep 12h"
# since some steps are skipped on s390x, if they are necessary, run them here
env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"
else
JENKINS_USER="--user jenkins"
USED_IMAGE="${DOCKER_IMAGE}"
DOCKER_SHELL_CMD=
fi
# Leaving 1GB for the runner and other things
TOTAL_AVAILABLE_MEMORY_IN_GB=$(awk '/MemTotal/ { printf "%.3f \n", $2/1024/1024 - 1 }' /proc/meminfo)
# https://docs.docker.com/engine/containers/resource_constraints/#--memory-swap-details, the 3GB swap
# comes from https://github.com/pytorch/test-infra/pull/6058
TOTAL_MEMORY_WITH_SWAP=$(("${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}" + 3))
# detached container should get cleaned up by teardown_ec2_linux
# Used for JENKINS_USER and DOCKER_SHELL_CMD, which can be empty
# shellcheck disable=SC2086
container_name=$(docker run \
-e BUILD_ENVIRONMENT \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e AWS_DEFAULT_REGION \
-e PR_NUMBER \
-e SHA1 \
-e BRANCH \
-e SCCACHE_BUCKET \
-e SCCACHE_REGION \
-e SCCACHE_S3_KEY_PREFIX \
-e XLA_CUDA \
-e XLA_CLANG_CACHE_S3_BUCKET_NAME \
-e SKIP_SCCACHE_INITIALIZATION=1 \
-e TORCH_CUDA_ARCH_LIST \
-e PR_LABELS \
-e OUR_GITHUB_JOB_ID \
-e HUGGING_FACE_HUB_TOKEN \
-e SCRIBE_GRAPHQL_ACCESS_TOKEN \
-e USE_SPLIT_BUILD \
--memory="${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}g" \
--memory-swap="${TOTAL_MEMORY_WITH_SWAP}g" \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--tty \
--detach \
${JENKINS_USER} \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-w /var/lib/jenkins/workspace \
"${USED_IMAGE}" \
${DOCKER_SHELL_CMD}
)
docker exec -t "${container_name}" sh -c '.ci/pytorch/build.sh'
END_TIME=$(date +%s)
echo "build_time=$((END_TIME - START_TIME))" >> "$GITHUB_OUTPUT"
- name: Archive artifacts into zip
if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped'
run: |
zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .additional_ci_files
- name: Store PyTorch Build Artifacts on S3
uses: seemethere/upload-artifact-s3@v5
if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && !inputs.use_split_build && inputs.build-environment != 'linux-s390x-binary-manywheel'
with:
name: ${{ inputs.build-environment }}
retention-days: 14
if-no-files-found: error
path: artifacts.zip
s3-bucket: ${{ inputs.s3-bucket }}
- name: Store PyTorch Build Artifacts on S3 for split build
uses: seemethere/upload-artifact-s3@v5
if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && inputs.use_split_build && inputs.build-environment != 'linux-s390x-binary-manywheel'
with:
name: ${{ inputs.build-environment }}-experimental-split-build
retention-days: 14
if-no-files-found: error
path: artifacts.zip
s3-bucket: ${{ inputs.s3-bucket }}
- name: Store PyTorch Build Artifacts for s390x
uses: actions/upload-artifact@v4
if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && !inputs.use_split_build && inputs.build-environment == 'linux-s390x-binary-manywheel'
with:
name: ${{ inputs.build-environment }}
retention-days: 14
if-no-files-found: error
path: artifacts.zip
- name: Store PyTorch Build Artifacts for s390x for split build
uses: actions/upload-artifact@v4
if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && inputs.use_split_build && inputs.build-environment == 'linux-s390x-binary-manywheel'
with:
name: ${{ inputs.build-environment }}-experimental-split-build
retention-days: 14
if-no-files-found: error
path: artifacts.zip
- name: Upload sccache stats
if: steps.build.outcome != 'skipped' && inputs.build-environment != 'linux-s390x-binary-manywheel'
uses: ./.github/actions/upload-sccache-stats
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
build-time: ${{ steps.build.outputs.build_time }}
- name: Teardown Linux
uses: pytorch/test-infra/.github/actions/teardown-linux@main
if: always() && inputs.build-environment != 'linux-s390x-binary-manywheel'
- name: Cleanup docker
if: always() && inputs.build-environment == 'linux-s390x-binary-manywheel'
shell: bash
run: |
# on s390x stop the container for clean worker stop
docker stop -a || true
docker kill -a || true