diff --git a/.github/workflows/self-push.yml b/.github/workflows/self-push.yml index 4109dfd14..e78757705 100644 --- a/.github/workflows/self-push.yml +++ b/.github/workflows/self-push.yml @@ -73,7 +73,7 @@ jobs: echo "::set-output name=test_map::$test_map" run_tests_single_gpu: - name: Model Tests on single GPU + name: Model tests needs: setup # `dummy` means there is no test to run if: contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true @@ -81,8 +81,8 @@ jobs: fail-fast: false matrix: folders: ${{ fromJson(needs.setup.outputs.matrix) }} - machines: [single-gpu] - runs-on: [self-hosted, docker-gpu, '${{ matrix.machines }}'] + machine_type: [single-gpu] + runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}'] container: image: huggingface/transformers-all-latest-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -117,22 +117,22 @@ jobs: - name: Run all non-slow selected tests on GPU working-directory: /transformers run: | - python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.machines }}_tests_gpu_${{ matrix.folders }} ${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }} + python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} ${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }} - name: Failure short reports if: ${{ failure() }} continue-on-error: true - run: cat /transformers/reports/${{ matrix.machines }}_tests_gpu_${{ matrix.folders }}/failures_short.txt + run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt - name: Test suite reports artifacts if: ${{ always() }} uses: actions/upload-artifact@v2 with: - name: ${{ matrix.machines }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports - path: /transformers/reports/${{ matrix.machines }}_tests_gpu_${{ matrix.folders }} + name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders 
}}_test_reports + path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} run_tests_multi_gpu: - name: Model Tests on multi GPUs + name: Model tests needs: setup # `dummy` means there is no test to run if: contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true @@ -140,8 +140,8 @@ jobs: fail-fast: false matrix: folders: ${{ fromJson(needs.setup.outputs.matrix) }} - machines: [multi-gpu] - runs-on: [self-hosted, docker-gpu, '${{ matrix.machines }}'] + machine_type: [multi-gpu] + runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}'] container: image: huggingface/transformers-all-latest-gpu options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -178,29 +178,29 @@ jobs: MKL_SERVICE_FORCE_INTEL: 1 working-directory: /transformers run: | - python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.machines }}_tests_gpu_${{ matrix.folders }} ${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }} + python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} ${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }} - name: Failure short reports if: ${{ failure() }} continue-on-error: true - run: cat /transformers/reports/${{ matrix.machines }}_tests_gpu_${{ matrix.folders }}/failures_short.txt + run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt - name: Test suite reports artifacts if: ${{ always() }} uses: actions/upload-artifact@v2 with: - name: ${{ matrix.machines }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports - path: /transformers/reports/${{ matrix.machines }}_tests_gpu_${{ matrix.folders }} + name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} run_tests_torch_cuda_extensions_single_gpu: - name: Torch CUDA 
extension tests on single GPU + name: Torch CUDA extension tests needs: setup if: contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended') strategy: fail-fast: false matrix: - machines: [single-gpu] - runs-on: [self-hosted, docker-gpu, '${{ matrix.machines }}'] + machine_type: [single-gpu] + runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}'] container: image: nvcr.io/nvidia/pytorch:21.03-py3 options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -227,29 +227,29 @@ jobs: - name: Run all non-slow selected tests on GPU # TODO: Here we pass all tests in the 2 folders for simplicity. It's better to pass only the identified tests. run: | - python -m pytest -n 1 --dist=loadfile -v --make-reports=${{ matrix.machines }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended + python -m pytest -n 1 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended - name: Failure short reports if: ${{ failure() }} continue-on-error: true - run: cat reports/${{ matrix.machines }}_tests_torch_cuda_extensions_gpu/failures_short.txt + run: cat reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt - name: Test suite reports artifacts if: ${{ always() }} uses: actions/upload-artifact@v2 with: - name: ${{ matrix.machines }}_run_tests_torch_cuda_extensions_gpu_test_reports - path: reports/${{ matrix.machines }}_tests_torch_cuda_extensions_gpu + name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports + path: reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu run_tests_torch_cuda_extensions_multi_gpu: - name: Torch CUDA extension tests on multi GPUs + name: Torch CUDA extension tests needs: setup if: contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended') strategy: 
fail-fast: false matrix: - machines: [multi-gpu] - runs-on: [self-hosted, docker-gpu, '${{ matrix.machines }}'] + machine_type: [multi-gpu] + runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}'] container: image: nvcr.io/nvidia/pytorch:21.03-py3 options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -277,19 +277,19 @@ jobs: - name: Run all non-slow selected tests on GPU # TODO: Here we pass all tests in the 2 folders for simplicity. It's better to pass only the identified tests. run: | - python -m pytest -n 1 --dist=loadfile -v --make-reports=${{ matrix.machines }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended + python -m pytest -n 1 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended - name: Failure short reports if: ${{ failure() }} continue-on-error: true - run: cat reports/${{ matrix.machines }}_tests_torch_cuda_extensions_gpu/failures_short.txt + run: cat reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt - name: Test suite reports artifacts if: ${{ always() }} uses: actions/upload-artifact@v2 with: - name: ${{ matrix.machines }}_run_tests_torch_cuda_extensions_gpu_test_reports - path: reports/${{ matrix.machines }}_tests_torch_cuda_extensions_gpu + name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports + path: reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu send_results: name: Send results to webhook diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index 810422b2d..89634fdd6 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -26,8 +26,8 @@ jobs: name: Setup strategy: matrix: - machines: [multi-gpu-docker, single-gpu-docker] - runs-on: ${{ matrix.machines }} + machine_type: [single-gpu, multi-gpu] + runs-on: ${{ format('{0}-{1}', matrix.machine_type, 
'docker') }} container: image: huggingface/transformers-all-latest-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -69,8 +69,8 @@ jobs: fail-fast: false matrix: folders: ${{ fromJson(needs.setup.outputs.matrix) }} - machines: [single-gpu-docker] - runs-on: ${{ matrix.machines }} + machine_type: [single-gpu] + runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} container: image: huggingface/transformers-all-latest-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -87,34 +87,25 @@ jobs: echo "$matrix_folders" echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV - # Set machine type, i.e. `single-gpu` or `multi-gpu`. Here we just remove `-docker`. - - name: Set machine type from ${{ matrix.machines }} - shell: bash - run: | - machine_type=${{ matrix.machines }} - machine_type=${machine_type/'-docker'/''} - echo "machine_type=$machine_type" - echo "machine_type=$machine_type" >> $GITHUB_ENV - - name: Update clone working-directory: /transformers run: git fetch && git checkout ${{ github.sha }} - name: Run all tests on GPU working-directory: /transformers - run: python3 -m pytest -v --make-reports=${{ env.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} + run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} - name: Failure short reports if: ${{ failure() }} continue-on-error: true - run: cat /transformers/reports/${{ env.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt + run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt - name: Test suite reports artifacts if: ${{ always() }} uses: actions/upload-artifact@v2 with: - name: ${{ env.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports - path: /transformers/reports/${{ env.machine_type }}_tests_gpu_${{ matrix.folders 
}} + name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} run_tests_multi_gpu: name: Model tests @@ -122,8 +113,8 @@ jobs: fail-fast: false matrix: folders: ${{ fromJson(needs.setup.outputs.matrix) }} - machines: [multi-gpu-docker] - runs-on: ${{ matrix.machines }} + machine_type: [multi-gpu] + runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} container: image: huggingface/transformers-all-latest-gpu options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -140,34 +131,25 @@ jobs: echo "$matrix_folders" echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV - # Set machine type, i.e. `single-gpu` or `multi-gpu`. Here we just remove `-docker`. - - name: Set machine type from ${{ matrix.machines }} - shell: bash - run: | - machine_type=${{ matrix.machines }} - machine_type=${machine_type/'-docker'/''} - echo "machine_type=$machine_type" - echo "machine_type=$machine_type" >> $GITHUB_ENV - - name: Update clone working-directory: /transformers run: git fetch && git checkout ${{ github.sha }} - name: Run all tests on GPU working-directory: /transformers - run: python3 -m pytest -v --make-reports=${{ env.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} + run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} - name: Failure short reports if: ${{ failure() }} continue-on-error: true - run: cat /transformers/reports/${{ env.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt + run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt - name: Test suite reports artifacts if: ${{ always() }} uses: actions/upload-artifact@v2 with: - name: ${{ env.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports - path: 
/transformers/reports/${{ env.machine_type }}_tests_gpu_${{ matrix.folders }} + name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} run_examples_gpu: name: Examples directory @@ -204,22 +186,13 @@ jobs: strategy: fail-fast: false matrix: - machines: [multi-gpu-docker, single-gpu-docker] - runs-on: ${{ matrix.machines }} + machine_type: [single-gpu, multi-gpu] + runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} container: image: huggingface/transformers-pytorch-gpu options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ needs: setup steps: - # Set machine type, i.e. `single-gpu` or `multi-gpu`. Here we just remove `-docker`. - - name: Set machine type from ${{ matrix.machines }} - shell: bash - run: | - machine_type=${{ matrix.machines }} - machine_type=${machine_type/'-docker'/''} - echo "machine_type=$machine_type" - echo "machine_type=$machine_type" >> $GITHUB_ENV - - name: Update clone working-directory: /transformers run: git fetch && git checkout ${{ github.sha }} @@ -229,41 +202,32 @@ jobs: env: RUN_PIPELINE_TESTS: yes run: | - python3 -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=${{ env.machine_type }}_tests_torch_pipeline_gpu tests + python3 -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests - name: Failure short reports if: ${{ failure() }} continue-on-error: true - run: cat /transformers/reports/${{ env.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt + run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt - name: Test suite reports artifacts if: ${{ always() }} uses: actions/upload-artifact@v2 with: - name: ${{ env.machine_type }}_run_tests_torch_pipeline_gpu - path: /transformers/reports/${{ env.machine_type 
}}_tests_torch_pipeline_gpu + name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu + path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu run_pipelines_tf_gpu: name: TensorFlow pipelines strategy: fail-fast: false matrix: - machines: [multi-gpu-docker, single-gpu-docker] - runs-on: ${{ matrix.machines }} + machine_type: [single-gpu, multi-gpu] + runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} container: image: huggingface/transformers-tensorflow-gpu options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ needs: setup steps: - # Set machine type, i.e. `single-gpu` or `multi-gpu`. Here we just remove `-docker`. - - name: Set machine type from ${{ matrix.machines }} - shell: bash - run: | - machine_type=${{ matrix.machines }} - machine_type=${machine_type/'-docker'/''} - echo "machine_type=$machine_type" - echo "machine_type=$machine_type" >> $GITHUB_ENV - - name: Update clone working-directory: /transformers run: | @@ -274,41 +238,32 @@ jobs: env: RUN_PIPELINE_TESTS: yes run: | - python3 -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=${{ env.machine_type }}_tests_tf_pipeline_gpu tests + python3 -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=${{ matrix.machine_type }}_tests_tf_pipeline_gpu tests - name: Failure short reports if: ${{ always() }} run: | - cat /transformers/reports/${{ env.machine_type }}_tests_tf_pipeline_gpu/failures_short.txt + cat /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu/failures_short.txt - name: Test suite reports artifacts if: ${{ always() }} uses: actions/upload-artifact@v2 with: - name: ${{ env.machine_type }}_run_tests_tf_pipeline_gpu - path: /transformers/reports/${{ env.machine_type }}_tests_tf_pipeline_gpu + name: ${{ matrix.machine_type }}_run_tests_tf_pipeline_gpu + path: /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu 
run_all_tests_torch_cuda_extensions_gpu: name: Torch CUDA extension tests strategy: fail-fast: false matrix: - machines: [multi-gpu-docker, single-gpu-docker] - runs-on: ${{ matrix.machines }} + machine_type: [single-gpu, multi-gpu] + runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} needs: setup container: image: huggingface/transformers-pytorch-deepspeed-latest-gpu options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ steps: - # Set machine type, i.e. `single-gpu` or `multi-gpu`. Here we just remove `-docker`. - - name: Set machine type from ${{ matrix.machines }} - shell: bash - run: | - machine_type=${{ matrix.machines }} - machine_type=${machine_type/'-docker'/''} - echo "machine_type=$machine_type" - echo "machine_type=$machine_type" >> $GITHUB_ENV - - name: Update clone working-directory: /workspace/transformers run: git fetch && git checkout ${{ github.sha }} @@ -324,19 +279,19 @@ jobs: - name: Run all tests on GPU working-directory: /workspace/transformers run: | - python -m pytest -v --make-reports=${{ env.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended + python -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended - name: Failure short reports if: ${{ failure() }} continue-on-error: true - run: cat /workspace/transformers/reports/${{ env.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt + run: cat /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt - name: Test suite reports artifacts if: ${{ always() }} uses: actions/upload-artifact@v2 with: - name: ${{ env.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports - path: /workspace/transformers/reports/${{ env.machine_type }}_tests_torch_cuda_extensions_gpu + name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports + path: 
/workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu send_results: diff --git a/utils/notification_service.py b/utils/notification_service.py index 29c5649d2..0b7e8387f 100644 --- a/utils/notification_service.py +++ b/utils/notification_service.py @@ -621,7 +621,8 @@ if __name__ == "__main__": if "stats" in artifact: # Link to the GitHub Action job model_results[model]["job_link"] = github_actions_job_links.get( - f"Model tests ({model}, {artifact_path['gpu']}-gpu)" + # The job names use `matrix.folders`, which contains things like `models/bert` instead of `models_bert` + f"Model tests ({model.replace('models_', 'models/')}, {artifact_path['gpu']}-gpu)" ) failed, success, time_spent = handle_test_results(artifact["stats"])