From 4df3ccddb763d2e60e84c851c0e36e35d6e7faee Mon Sep 17 00:00:00 2001 From: Guillaume LEGENDRE Date: Thu, 3 Oct 2024 14:39:49 +0200 Subject: [PATCH] Migrate the CI runners to the new clusters (#33849) * try fixing push-ci * move to new runners * move benchmark.yml to new runners * move doctest_job.yml to new runners * move doctests.yml to new runners * move push-important-models.yml to new runners * move self-pr-slow-ci.yml to new runners * fix typo Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com> * fix working directory Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com> * fix working directory Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com> * improve code Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com> --------- Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com> --- .github/workflows/benchmark.yml | 3 +- .github/workflows/doctest_job.yml | 3 +- .github/workflows/doctests.yml | 5 +- .github/workflows/push-important-models.yml | 3 +- .github/workflows/self-pr-slow-ci.yml | 36 ++++-- .github/workflows/self-push.yml | 133 +++++++++++++++----- utils/tests_fetcher.py | 1 + 7 files changed, 139 insertions(+), 45 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index cb9a3d7b7..75a837d69 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -13,7 +13,8 @@ env: jobs: benchmark: name: Benchmark - runs-on: [single-gpu, nvidia-gpu, a10, ci] + runs-on: + group: aws-g5-4xlarge-cache container: image: huggingface/transformers-all-latest-gpu options: --gpus all --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ diff --git a/.github/workflows/doctest_job.yml b/.github/workflows/doctest_job.yml index 98be98529..eb62b797b 100644 --- a/.github/workflows/doctest_job.yml +++ b/.github/workflows/doctest_job.yml @@ -27,7 +27,8 @@ jobs: fail-fast: false matrix: split_keys: ${{ fromJson(inputs.split_keys) }} - runs-on: [single-gpu, nvidia-gpu, t4, ci] + runs-on: + group: aws-g4dn-2xlarge-cache container: image: huggingface/transformers-all-latest-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ diff --git a/.github/workflows/doctests.yml b/.github/workflows/doctests.yml index 4b515c741..472b07684 100644 --- a/.github/workflows/doctests.yml +++ b/.github/workflows/doctests.yml @@ -14,7 +14,8 @@ env: jobs: setup: name: Setup - runs-on: [single-gpu, nvidia-gpu, t4, ci] + runs-on: + group: aws-g4dn-2xlarge-cache container: image: huggingface/transformers-all-latest-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -85,4 +86,4 @@ jobs: uses: actions/upload-artifact@v4 with: name: doc_test_results - path: doc_test_results \ No newline at end of file + path: doc_test_results diff --git a/.github/workflows/push-important-models.yml b/.github/workflows/push-important-models.yml index 41bcd43fc..1887af0f4 100644 --- a/.github/workflows/push-important-models.yml +++ b/.github/workflows/push-important-models.yml @@ -52,7 +52,8 @@ jobs: test_modified_files: needs: get_modified_models name: Slow & FA2 tests - runs-on: [single-gpu, nvidia-gpu, a10, ci] + runs-on: + group: aws-g5-4xlarge-cache container: image: huggingface/transformers-all-latest-gpu options: --gpus all --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ diff --git a/.github/workflows/self-pr-slow-ci.yml b/.github/workflows/self-pr-slow-ci.yml index 2287b5e3f..bcf4d4d68 100644 --- a/.github/workflows/self-pr-slow-ci.yml +++ b/.github/workflows/self-pr-slow-ci.yml @@ -65,8 +65,9 @@ jobs: fail-fast: false matrix: folders: ${{ fromJson(needs.find_models_to_run.outputs.models) }} - machine_type: [single-gpu, multi-gpu] - runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, ci] + machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache] + runs-on: + group: '${{ matrix.machine_type }}' container: image: huggingface/transformers-all-latest-gpu options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -99,6 +100,21 @@ jobs: run: | nvidia-smi + - name: Set `machine_type` for report and artifact names + working-directory: /transformers + shell: bash + run: | + echo "${{ matrix.machine_type }}" + if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then + machine_type=single-gpu + elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then + machine_type=multi-gpu + else + machine_type=${{ matrix.machine_type }} + fi + echo "$machine_type" + echo "machine_type=$machine_type" >> $GITHUB_ENV + - name: Environment working-directory: /transformers run: | @@ -113,23 +129,23 @@ jobs: run: | export CUDA_VISIBLE_DEVICES="$(python3 utils/set_cuda_devices_for_ci.py --test_folder ${{ matrix.folders }})" echo $CUDA_VISIBLE_DEVICES - python3 -m pytest -v -rsfE --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} + python3 -m pytest -v -rsfE --make-reports=${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} - name: Failure short reports if: ${{ failure() }} continue-on-error: true - run: cat /transformers/reports/${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt + run: cat /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt - name: Make sure report directory exists shell: bash run: | - mkdir -p /transformers/reports/${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports - echo "hello" > /transformers/reports/${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/hello.txt - echo "${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports" + mkdir -p /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports + echo "hello" > /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/hello.txt + echo "${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports" - - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports" + - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports" if: ${{ always() }} uses: actions/upload-artifact@v4 with: - name: ${{ matrix.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports - path: /transformers/reports/${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports + name: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports + path: /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports diff --git a/.github/workflows/self-push.yml b/.github/workflows/self-push.yml index b328f65d3..940495c28 100644 --- a/.github/workflows/self-push.yml +++ b/.github/workflows/self-push.yml @@ -32,8 +32,9 @@ jobs: name: Setup strategy: matrix: - machine_type: [single-gpu, multi-gpu] - runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, push-ci] + machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache] + runs-on: + group: '${{ matrix.machine_type }}' container: image: huggingface/transformers-all-latest-gpu-push-ci options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -131,8 +132,9 @@ jobs: fail-fast: false matrix: folders: ${{ fromJson(needs.setup.outputs.matrix) }} - machine_type: [single-gpu] - runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, push-ci] + machine_type: [aws-g4dn-2xlarge-cache] + runs-on: + group: '${{ matrix.machine_type }}' container: image: huggingface/transformers-all-latest-gpu-push-ci options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -162,6 +164,23 @@ jobs: echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}" echo "env.CI_SHA = ${{ env.CI_SHA }}" + - name: Set `machine_type` for report and artifact names + working-directory: /transformers + shell: bash + run: | + echo "${{ matrix.machine_type }}" + + if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then + machine_type=single-gpu + elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then + machine_type=multi-gpu + else + machine_type=${{ matrix.machine_type }} + fi + + echo "$machine_type" + echo "machine_type=$machine_type" >> $GITHUB_ENV + - name: Update clone using environment variables working-directory: /transformers run: | @@ -203,19 +222,19 @@ jobs: - name: Run all non-slow selected tests on GPU working-directory: /transformers run: | - python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} ${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }} + python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ env.machine_type }}_tests_gpu_${{ matrix.folders }} ${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }} - name: Failure short reports if: ${{ failure() }} continue-on-error: true - run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt + run: cat /transformers/reports/${{ env.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt - - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports" + - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports" if: ${{ always() }} uses: actions/upload-artifact@v4 with: - name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports - path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} + name: ${{ env.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + path: /transformers/reports/${{ env.machine_type }}_tests_gpu_${{ matrix.folders }} run_tests_multi_gpu: name: Model tests @@ -226,8 +245,9 @@ jobs: fail-fast: false matrix: folders: ${{ fromJson(needs.setup.outputs.matrix) }} - machine_type: [multi-gpu] - runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, push-ci] + machine_type: [aws-g4dn-12xlarge-cache] + runs-on: + group: '${{ matrix.machine_type }}' container: image: huggingface/transformers-all-latest-gpu-push-ci options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -257,6 +277,23 @@ jobs: echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}" echo "env.CI_SHA = ${{ env.CI_SHA }}" + - name: Set `machine_type` for report and artifact names + working-directory: /transformers + shell: bash + run: | + echo "${{ matrix.machine_type }}" + + if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then + machine_type=single-gpu + elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then + machine_type=multi-gpu + else + machine_type=${{ matrix.machine_type }} + fi + + echo "$machine_type" + echo "machine_type=$machine_type" >> $GITHUB_ENV + - name: Update clone using environment variables working-directory: /transformers run: | @@ -300,19 +337,19 @@ jobs: MKL_SERVICE_FORCE_INTEL: 1 working-directory: /transformers run: | - python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} ${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }} + python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ env.machine_type }}_tests_gpu_${{ matrix.folders }} ${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }} - name: Failure short reports if: ${{ failure() }} continue-on-error: true - run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt + run: cat /transformers/reports/${{ env.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt - - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports" + - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports" if: ${{ always() }} uses: actions/upload-artifact@v4 with: - name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports - path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} + name: ${{ env.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + path: /transformers/reports/${{ env.machine_type }}_tests_gpu_${{ matrix.folders }} run_tests_torch_cuda_extensions_single_gpu: name: Torch CUDA extension tests @@ -321,8 +358,9 @@ jobs: strategy: fail-fast: false matrix: - machine_type: [single-gpu] - runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, push-ci] + machine_type: [aws-g4dn-2xlarge-cache] + runs-on: + group: '${{ matrix.machine_type }}' container: image: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -352,6 +390,23 @@ jobs: echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}" echo "env.CI_SHA = ${{ env.CI_SHA }}" + - name: Set `machine_type` for report and artifact names + working-directory: /workspace/transformers + shell: bash + run: | + echo "${{ matrix.machine_type }}" + + if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then + machine_type=single-gpu + elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then + machine_type=multi-gpu + else + machine_type=${{ matrix.machine_type }} + fi + + echo "$machine_type" + echo "machine_type=$machine_type" >> $GITHUB_ENV + - name: Update clone using environment variables working-directory: /workspace/transformers run: | @@ -392,19 +447,19 @@ jobs: working-directory: /workspace/transformers # TODO: Here we pass all tests in the 2 folders for simplicity. It's better to pass only the identified tests. run: | - python -m pytest -n 1 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended + python -m pytest -n 1 --dist=loadfile -v --make-reports=${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended - name: Failure short reports if: ${{ failure() }} continue-on-error: true - run: cat /workspace/transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt + run: cat /workspace/transformers/reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt - - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports" + - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports" if: ${{ always() }} uses: actions/upload-artifact@v4 with: - name: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports - path: /workspace/transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports + name: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports + path: /workspace/transformers/reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports run_tests_torch_cuda_extensions_multi_gpu: name: Torch CUDA extension tests @@ -413,8 +468,9 @@ jobs: strategy: fail-fast: false matrix: - machine_type: [multi-gpu] - runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, push-ci] + machine_type: [aws-g4dn-12xlarge-cache] + runs-on: + group: '${{ matrix.machine_type }}' container: image: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -444,6 +500,23 @@ jobs: echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}" echo "env.CI_SHA = ${{ env.CI_SHA }}" + - name: Set `machine_type` for report and artifact names + working-directory: /workspace/transformers + shell: bash + run: | + echo "${{ matrix.machine_type }}" + + if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then + machine_type=single-gpu + elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then + machine_type=multi-gpu + else + machine_type=${{ matrix.machine_type }} + fi + + echo "$machine_type" + echo "machine_type=$machine_type" >> $GITHUB_ENV + - name: Update clone using environment variables working-directory: /workspace/transformers run: | @@ -484,19 +557,19 @@ jobs: working-directory: /workspace/transformers # TODO: Here we pass all tests in the 2 folders for simplicity. It's better to pass only the identified tests. run: | - python -m pytest -n 1 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended + python -m pytest -n 1 --dist=loadfile -v --make-reports=${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended - name: Failure short reports if: ${{ failure() }} continue-on-error: true - run: cat /workspace/transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt + run: cat /workspace/transformers/reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt - - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports" + - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports" if: ${{ always() }} uses: actions/upload-artifact@v4 with: - name: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports - path: /workspace/transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports + name: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports + path: /workspace/transformers/reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports send_results: name: Send results to webhook diff --git a/utils/tests_fetcher.py b/utils/tests_fetcher.py index b8408f9d4..9e15f2e11 100644 --- a/utils/tests_fetcher.py +++ b/utils/tests_fetcher.py @@ -1153,6 +1153,7 @@ JOB_TO_TEST_FILE = { def create_test_list_from_filter(full_test_list, out_path): + os.makedirs(out_path, exist_ok=True) all_test_files = "\n".join(full_test_list) for job_name, _filter in JOB_TO_TEST_FILE.items(): file_name = os.path.join(out_path, f"{job_name}_test_list.txt")