pytorch/.github/workflows/_linux-build.yml

name: linux-build

on:
  workflow_call:
    inputs:
      build-environment:
        required: true
        type: string
        description: Top-level label for what's being built/tested.
      docker-image-name:
        required: true
        type: string
        description: Name of the base docker image to build with.
      build-generates-artifacts:
        required: false
        type: boolean
        default: true
        description: If set, upload generated build artifacts.
      build-with-debug:
        required: false
        type: boolean
        default: false
        description: If set, build in debug mode.
      sync-tag:
        required: false
        type: string
        default: ""
        description: |
          If this is set, our linter will use this to make sure that every other
          job with the same `sync-tag` is identical.
      cuda-arch-list:
        required: false
        type: string
        default: "5.2"
        description: |
          List of CUDA architectures CI build should target.
      runner_prefix:
        required: false
        default: ""
        type: string
        description: Prefix for runner label
      runner:
        required: false
        type: string
        default: "linux.2xlarge"
        description: |
          Label of the runner this job should run on.
      test-matrix:
        required: false
        type: string
        description: |
          An option JSON description of what test configs to run later on. This
          is moved here from the Linux test workflow so that we can apply filter
          logic using test-config labels earlier and skip unnecessary builds
      selected-test-configs:
        description: |
          A comma-separated list of test configurations from the test matrix to keep,
          The empty list means we are going to keep every configurations by defaults
        required: false
        type: string
        default: ""
      s3-bucket:
        description: S3 bucket to download artifact
        required: false
        type: string
        default: "gha-artifacts"
      aws-role-to-assume:
        description: Role to assume for downloading artifacts
        required: false
        type: string
        default: ""
      use_split_build:
        description: |
          [Experimental] Build a libtorch only wheel and build pytorch such that
          are built from the libtorch wheel.
        required: false
        type: boolean
        default: false

    secrets:
      HUGGING_FACE_HUB_TOKEN:
        required: false
        description: |
          HF Auth token to avoid rate limits when downloading models or datasets from hub
      SCRIBE_GRAPHQL_ACCESS_TOKEN:
        required: false
        description: |
          FB app token to write to scribe endpoint


    outputs:
      docker-image:
        value: ${{ jobs.build.outputs.docker-image }}
        description: The docker image containing the built PyTorch.
      test-matrix:
        value: ${{ jobs.build.outputs.test-matrix }}
        description: An optional JSON description of what test configs to run later on.

jobs:
  build:
    environment: ${{ github.ref == 'refs/heads/main' && 'scribe-protected' || startsWith(github.ref, 'refs/heads/release/') && 'scribe-protected' || contains(github.event.pull_request.labels.*.name, 'ci-scribe') && 'scribe-pr' || '' }}
    # Don't run on forked repos
    if: github.repository_owner == 'pytorch'
    runs-on: ${{ inputs.runner_prefix}}${{ inputs.runner }}
    timeout-minutes: 240
    outputs:
      docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
      test-matrix: ${{ steps.filter.outputs.test-matrix }}
    steps:
      - name: Setup SSH (Click me for login details)
        uses: pytorch/test-infra/.github/actions/setup-ssh@main
        if: inputs.build-environment != 'linux-s390x-binary-manywheel'
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}

      # [pytorch repo ref]
      # Use a pytorch/pytorch reference instead of a reference to the local
      # checkout because when we run this action we don't *have* a local
      # checkout. In other cases you should prefer a local checkout.
      - name: Checkout PyTorch
        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
        with:
          no-sudo: true

      - name: Setup Linux
        uses: ./.github/actions/setup-linux
        if: inputs.build-environment != 'linux-s390x-binary-manywheel'

      - name: configure aws credentials
        uses: aws-actions/configure-aws-credentials@v3
        if: ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
        with:
          role-to-assume: ${{ inputs.aws-role-to-assume }}
          role-session-name: gha-linux-build
          aws-region: us-east-1

      - name: Calculate docker image
        id: calculate-docker-image
        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
        if: inputs.build-environment != 'linux-s390x-binary-manywheel'
        with:
          docker-image-name: ${{ inputs.docker-image-name }}

      - name: Use following to pull public copy of the image
        id: print-ghcr-mirror
        if: inputs.build-environment != 'linux-s390x-binary-manywheel'
        env:
          ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
        shell: bash
        run: |
          tag=${ECR_DOCKER_IMAGE##*/}
          echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"

      - name: Pull docker image
        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
        if: inputs.build-environment != 'linux-s390x-binary-manywheel'
        with:
          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

      - name: Parse ref
        id: parse-ref
        run: .github/scripts/parse_ref.py

      - name: Get workflow job id
        id: get-job-id
        uses: ./.github/actions/get-workflow-job-id
        if: always()
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}

      # Apply the filter logic to the build step too if the test-config label is already there
      - name: Select all requested test configurations (if the test matrix is available)
        id: filter
        uses: ./.github/actions/filter-test-configs
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          test-matrix: ${{ inputs.test-matrix }}
          selected-test-configs: ${{ inputs.selected-test-configs }}
          job-name: ${{ steps.get-job-id.outputs.job-name }}

      - name: Download pytest cache
        uses: ./.github/actions/pytest-cache-download
        continue-on-error: true
        if: inputs.build-environment != 'linux-s390x-binary-manywheel'
        with:
          cache_dir: .pytest_cache
          job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}
          s3_bucket: ${{ inputs.s3-bucket }}

      - name: Build
        if: steps.filter.outputs.is-test-matrix-empty == 'False' || inputs.test-matrix == ''
        id: build
        env:
          BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
          BRANCH: ${{ steps.parse-ref.outputs.branch }}
          # TODO duplicated
          AWS_DEFAULT_REGION: us-east-1
          PR_NUMBER: ${{ github.event.pull_request.number }}
          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
          SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
          SCCACHE_REGION: us-east-1
          # Use the build environment here to make sure that all build jobs in the same environment
          # will share the same cache regardless of which workflow they belong. This should improve
          # the cache usage for jobs in non-pull workflows like periodic, slow, or inductor
          SCCACHE_S3_KEY_PREFIX: ${{ inputs.build-environment || github.workflow }}
          XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
          PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
          TORCH_CUDA_ARCH_LIST: ${{ inputs.cuda-arch-list }}
          DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
          DOCKER_IMAGE_S390X: ${{ inputs.docker-image-name }}
          XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
          DEBUG: ${{ inputs.build-with-debug && '1' || '0' }}
          OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
          SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
          USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
        run: |
          START_TIME=$(date +%s)
          if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then
            JENKINS_USER=
            USED_IMAGE="${DOCKER_IMAGE_S390X}"
            # ensure that docker container cleanly exits in 12 hours
            # if for some reason cleanup action doesn't stop container
            # when job is cancelled
            DOCKER_SHELL_CMD="sleep 12h"

            # since some steps are skipped on s390x, if they are necessary, run them here
            env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
            env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"
          else
            JENKINS_USER="--user jenkins"
            USED_IMAGE="${DOCKER_IMAGE}"
            DOCKER_SHELL_CMD=
          fi

          # Leaving 1GB for the runner and other things
          TOTAL_AVAILABLE_MEMORY_IN_GB=$(awk '/MemTotal/ { printf "%.3f \n", $2/1024/1024 - 1 }' /proc/meminfo)
          # https://docs.docker.com/engine/containers/resource_constraints/#--memory-swap-details, the 3GB swap
          # comes from https://github.com/pytorch/test-infra/pull/6058
          TOTAL_MEMORY_WITH_SWAP=$(("${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}" + 3))

          # detached container should get cleaned up by teardown_ec2_linux
          # Used for JENKINS_USER and DOCKER_SHELL_CMD, which can be empty
          # shellcheck disable=SC2086
          container_name=$(docker run \
            -e BUILD_ENVIRONMENT \
            -e MAX_JOBS="$(nproc --ignore=2)" \
            -e AWS_DEFAULT_REGION \
            -e PR_NUMBER \
            -e SHA1 \
            -e BRANCH \
            -e SCCACHE_BUCKET \
            -e SCCACHE_REGION \
            -e SCCACHE_S3_KEY_PREFIX \
            -e XLA_CUDA \
            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
            -e SKIP_SCCACHE_INITIALIZATION=1 \
            -e TORCH_CUDA_ARCH_LIST \
            -e PR_LABELS \
            -e OUR_GITHUB_JOB_ID \
            -e HUGGING_FACE_HUB_TOKEN \
            -e SCRIBE_GRAPHQL_ACCESS_TOKEN \
            -e USE_SPLIT_BUILD \
            --memory="${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}g" \
            --memory-swap="${TOTAL_MEMORY_WITH_SWAP}g" \
            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
            --tty \
            --detach \
            ${JENKINS_USER} \
            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
            -w /var/lib/jenkins/workspace \
            "${USED_IMAGE}" \
            ${DOCKER_SHELL_CMD}
          )
          docker exec -t "${container_name}" sh -c '.ci/pytorch/build.sh'

          END_TIME=$(date +%s)
          echo "build_time=$((END_TIME - START_TIME))" >> "$GITHUB_OUTPUT"

      - name: Archive artifacts into zip
        if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped'
        run: |
          zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .additional_ci_files

      - name: Store PyTorch Build Artifacts on S3
        uses: seemethere/upload-artifact-s3@v5
        if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && !inputs.use_split_build && inputs.build-environment != 'linux-s390x-binary-manywheel'
        with:
          name: ${{ inputs.build-environment }}
          retention-days: 14
          if-no-files-found: error
          path: artifacts.zip
          s3-bucket: ${{ inputs.s3-bucket }}

      - name: Store PyTorch Build Artifacts on S3 for split build
        uses: seemethere/upload-artifact-s3@v5
        if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && inputs.use_split_build && inputs.build-environment != 'linux-s390x-binary-manywheel'
        with:
          name: ${{ inputs.build-environment }}-experimental-split-build
          retention-days: 14
          if-no-files-found: error
          path: artifacts.zip
          s3-bucket: ${{ inputs.s3-bucket }}

      - name: Store PyTorch Build Artifacts for s390x
        uses: actions/upload-artifact@v4
        if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && !inputs.use_split_build && inputs.build-environment == 'linux-s390x-binary-manywheel'
        with:
          name: ${{ inputs.build-environment }}
          retention-days: 14
          if-no-files-found: error
          path: artifacts.zip

      - name: Store PyTorch Build Artifacts for s390x for split build
        uses: actions/upload-artifact@v4
        if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && inputs.use_split_build && inputs.build-environment == 'linux-s390x-binary-manywheel'
        with:
          name: ${{ inputs.build-environment }}-experimental-split-build
          retention-days: 14
          if-no-files-found: error
          path: artifacts.zip

      - name: Upload sccache stats
        if: steps.build.outcome != 'skipped' && inputs.build-environment != 'linux-s390x-binary-manywheel'
        uses: ./.github/actions/upload-sccache-stats
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          build-time: ${{ steps.build.outputs.build_time }}

      - name: Teardown Linux
        uses: pytorch/test-infra/.github/actions/teardown-linux@main
        if: always() && inputs.build-environment != 'linux-s390x-binary-manywheel'

      - name: Cleanup docker
        if: always() && inputs.build-environment == 'linux-s390x-binary-manywheel'
        shell: bash
        run: |
          # on s390x stop the container for clean worker stop
          docker stop -a || true
          docker kill -a || true