2024-06-29 04:48:06 +00:00
|
|
|
from __future__ import annotations
|
|
|
|
|
|
2022-06-30 07:27:44 +00:00
|
|
|
import gzip
|
|
|
|
|
import io
|
|
|
|
|
import json
|
2024-10-01 21:47:46 +00:00
|
|
|
import math
|
2022-06-12 17:29:01 +00:00
|
|
|
import os
|
2024-07-08 22:58:32 +00:00
|
|
|
import time
|
2022-06-12 17:29:01 +00:00
|
|
|
import zipfile
|
2024-10-02 23:19:28 +00:00
|
|
|
from functools import lru_cache
|
2022-06-12 17:29:01 +00:00
|
|
|
from pathlib import Path
|
PEP585 update - benchmarks tools torchgen (#145101)
This is one of a series of PRs to update us to PEP585 (changing Dict -> dict, List -> list, etc). Most of the PRs were completely automated with RUFF as follows:
Since RUFF UP006 is considered an "unsafe" fix first we need to enable unsafe fixes:
```
--- a/tools/linter/adapters/ruff_linter.py
+++ b/tools/linter/adapters/ruff_linter.py
@@ -313,6 +313,7 @@
"ruff",
"check",
"--fix-only",
+ "--unsafe-fixes",
"--exit-zero",
*([f"--config={config}"] if config else []),
"--stdin-filename",
```
Then we need to tell RUFF to allow UP006 (as a final PR once all of these have landed this will be made permanent):
```
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -40,7 +40,7 @@
[tool.ruff]
-target-version = "py38"
+target-version = "py39"
line-length = 88
src = ["caffe2", "torch", "torchgen", "functorch", "test"]
@@ -87,7 +87,6 @@
"SIM116", # Disable Use a dictionary instead of consecutive `if` statements
"SIM117",
"SIM118",
- "UP006", # keep-runtime-typing
"UP007", # keep-runtime-typing
]
select = [
```
Finally running `lintrunner -a --take RUFF` will fix up the deprecated uses.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/145101
Approved by: https://github.com/bobrenjc93
2025-01-17 23:13:25 +00:00
|
|
|
from typing import Any, Callable, Optional
|
2022-06-12 17:29:01 +00:00
|
|
|
|
|
|
|
|
import boto3 # type: ignore[import]
|
2022-06-30 07:27:44 +00:00
|
|
|
import requests
|
2022-06-12 17:29:01 +00:00
|
|
|
|
2024-06-29 04:48:06 +00:00
|
|
|
|
2022-06-12 17:29:01 +00:00
|
|
|
# Base URL of the GitHub REST API for the pytorch/pytorch repository; used to
# list and download workflow artifacts.
PYTORCH_REPO = "https://api.github.com/repos/pytorch/pytorch"
|
2024-10-02 23:19:28 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
@lru_cache
def get_s3_resource() -> Any:
    """Return a process-wide cached boto3 S3 resource.

    ``lru_cache`` on a zero-argument function memoizes the single call, so
    the S3 resource (and its underlying session) is created only once.
    """
    return boto3.resource("s3")
|
|
|
|
|
|
Fix rerun disabled test uploading logic (#103476)
After https://github.com/pytorch/pytorch/pull/102107, rerunning disabled tests only collect and run disable tests. A side effect of this change is that the skip message `Test is enabled but --rerun-disabled-tests verification mode is set, so only disabled tests are run` isn't in the test report anymore as these non-disabled tests are not going to be collected in the first place. This breaks the logic in the uploading script that depends on this string to know if the test report belongs to a rerunning disabled tests workflow.
* This PR updates the logic in `is_rerun_disabled_tests` check to count the number of times a test is run instead. In rerunning disabled tests mode, a test is run 50 times by default and 15 times for distributed tests (to avoid timeout). Both these numbers are larger than the max number of retries a test can get normally (3 x 3)
* This also removes the hacky `is_rerun_disabled_tests` check in `tools/stats/upload_test_stats.py` as rerun disabled tests reports are now very small (50 x the number of disabled tests)
### Testing
* `test_gradgrad_nn_GroupNorm_cuda_float64` now shows up correctly https://github.com/pytorch/pytorch/issues/98678
```
python3 -m tools.stats.check_disabled_tests --workflow-run-id 5229037746 --workflow-run-attempt 1 --repo "pytorch/pytorch"
Using temporary directory: /var/folders/x4/2kd9r0fn5b9bf_sbcw16fxsc0000gn/T/tmpdojg5vq5
Downloading test-reports-test-default-1-4-linux.g5.4xlarge.nvidia.gpu_14154925022.zip
Downloading test-reports-test-default-1-4-linux.g5.4xlarge.nvidia.gpu_14154925093.zip
Downloading test-reports-test-default-2-4-linux.g5.4xlarge.nvidia.gpu_14154925167.zip
Downloading test-reports-test-default-2-4-linux.g5.4xlarge.nvidia.gpu_14154925226.zip
Downloading test-reports-test-default-3-4-linux.g5.4xlarge.nvidia.gpu_14154925295.zip
Downloading test-reports-test-default-3-4-linux.g5.4xlarge.nvidia.gpu_14154925371.zip
Downloading test-reports-test-default-4-4-linux.g5.4xlarge.nvidia.gpu_14154925453.zip
Downloading test-reports-test-default-4-4-linux.g5.4xlarge.nvidia.gpu_14154925536.zip
Downloading test-reports-test-slow-1-1-linux.2xlarge_14154853469.zip
Downloading test-reports-test-slow-1-1-linux.rocm.gpu_14154932523.zip
Downloading test-reports-test-slow-1-1-linux.rocm.gpu_14154932563.zip
Downloading test-reports-test-slow-1-2-linux.4xlarge_14154873704.zip
Downloading test-reports-test-slow-1-2-linux.g5.4xlarge.nvidia.gpu_14154931154.zip
Downloading test-reports-test-slow-1-2-linux.g5.4xlarge.nvidia.gpu_14154931186.zip
Downloading test-reports-test-slow-2-2-linux.4xlarge_14154873756.zip
Downloading test-reports-test-slow-2-2-linux.g5.4xlarge.nvidia.gpu_14154931225.zip
Downloading test-reports-test-slow-2-2-linux.g5.4xlarge.nvidia.gpu_14154931267.zip
Extracting test-reports-test-default-1-4-linux.g5.4xlarge.nvidia.gpu_14154925022.zip to unzipped-test-reports-test-default-1-4-linux.g5.4xlarge.nvidia.gpu_14154925022
Extracting test-reports-test-default-1-4-linux.g5.4xlarge.nvidia.gpu_14154925093.zip to unzipped-test-reports-test-default-1-4-linux.g5.4xlarge.nvidia.gpu_14154925093
Extracting test-reports-test-default-2-4-linux.g5.4xlarge.nvidia.gpu_14154925167.zip to unzipped-test-reports-test-default-2-4-linux.g5.4xlarge.nvidia.gpu_14154925167
Extracting test-reports-test-default-2-4-linux.g5.4xlarge.nvidia.gpu_14154925226.zip to unzipped-test-reports-test-default-2-4-linux.g5.4xlarge.nvidia.gpu_14154925226
Extracting test-reports-test-default-3-4-linux.g5.4xlarge.nvidia.gpu_14154925295.zip to unzipped-test-reports-test-default-3-4-linux.g5.4xlarge.nvidia.gpu_14154925295
Extracting test-reports-test-default-3-4-linux.g5.4xlarge.nvidia.gpu_14154925371.zip to unzipped-test-reports-test-default-3-4-linux.g5.4xlarge.nvidia.gpu_14154925371
Extracting test-reports-test-default-4-4-linux.g5.4xlarge.nvidia.gpu_14154925453.zip to unzipped-test-reports-test-default-4-4-linux.g5.4xlarge.nvidia.gpu_14154925453
Extracting test-reports-test-default-4-4-linux.g5.4xlarge.nvidia.gpu_14154925536.zip to unzipped-test-reports-test-default-4-4-linux.g5.4xlarge.nvidia.gpu_14154925536
Extracting test-reports-test-slow-1-1-linux.2xlarge_14154853469.zip to unzipped-test-reports-test-slow-1-1-linux.2xlarge_14154853469
Extracting test-reports-test-slow-1-1-linux.rocm.gpu_14154932523.zip to unzipped-test-reports-test-slow-1-1-linux.rocm.gpu_14154932523
Extracting test-reports-test-slow-1-1-linux.rocm.gpu_14154932563.zip to unzipped-test-reports-test-slow-1-1-linux.rocm.gpu_14154932563
Extracting test-reports-test-slow-1-2-linux.4xlarge_14154873704.zip to unzipped-test-reports-test-slow-1-2-linux.4xlarge_14154873704
Extracting test-reports-test-slow-1-2-linux.g5.4xlarge.nvidia.gpu_14154931154.zip to unzipped-test-reports-test-slow-1-2-linux.g5.4xlarge.nvidia.gpu_14154931154
Extracting test-reports-test-slow-1-2-linux.g5.4xlarge.nvidia.gpu_14154931186.zip to unzipped-test-reports-test-slow-1-2-linux.g5.4xlarge.nvidia.gpu_14154931186
Extracting test-reports-test-slow-2-2-linux.4xlarge_14154873756.zip to unzipped-test-reports-test-slow-2-2-linux.4xlarge_14154873756
Extracting test-reports-test-slow-2-2-linux.g5.4xlarge.nvidia.gpu_14154931225.zip to unzipped-test-reports-test-slow-2-2-linux.g5.4xlarge.nvidia.gpu_14154931225
Extracting test-reports-test-slow-2-2-linux.g5.4xlarge.nvidia.gpu_14154931267.zip to unzipped-test-reports-test-slow-2-2-linux.g5.4xlarge.nvidia.gpu_14154931267
Downloading test-reports-runattempt1-test-slow-1-1-linux.rocm.gpu_14154932523.zip
Downloading test-reports-runattempt1-test-slow-1-1-linux.rocm.gpu_14154932563.zip
Extracting test-reports-runattempt1-test-slow-1-1-linux.rocm.gpu_14154932523.zip to unzipped-test-reports-runattempt1-test-slow-1-1-linux.rocm.gpu_14154932523
Extracting test-reports-runattempt1-test-slow-1-1-linux.rocm.gpu_14154932563.zip to unzipped-test-reports-runattempt1-test-slow-1-1-linux.rocm.gpu_14154932563
The following 32 tests should be re-enabled:
test_huge_index (__main__.TestCuda) from test_cuda.py
test_conv_bn_fuse_cpu (__main__.CpuTests) from inductor/test_torchinductor.py
test_multi_threads (__main__.TestTorchrun) from backends/xeon/test_launch.py
test_huge_index (__main__.TestCuda) from test_cuda_expandable_segments.py
test_memory_timeline_no_id (__main__.TestMemoryProfilerE2E) from profiler/test_memory_profiler.py
test_inverse_errors_large_cuda_float64 (__main__.TestLinalgCUDA) from test_linalg.py
test_trace_dependencies (__main__.TestAnalyze) from test_package.py
test_caching_pinned_memory (__main__.TestCuda) from test_cuda_expandable_segments.py
test_graph_concurrent_replay (__main__.TestCuda) from test_cuda_expandable_segments.py
test_module_attribute_mutation_violation_negative_1 (__main__.MutationExportTests) from dynamo/test_export_mutations.py
test_module_attribute_mutation_violation_negative_2 (__main__.MutationExportTests) from dynamo/test_export_mutations.py
test_module_attribute_mutation_violation_negative_4 (__main__.MutationExportTests) from dynamo/test_export_mutations.py
test_vmapjvpall_linalg_lu_cuda_float32 (__main__.TestOperatorsCUDA) from functorch/test_ops.py
test_vmapjvpvjp_linalg_lu_cuda_float32 (__main__.TestOperatorsCUDA) from functorch/test_ops.py
test_Conv2d_no_bias_cuda_tf32 (__main__.TestNN) from test_nn.py
test_save_graph_repro (__main__.TestAfterAot) from dynamo/test_after_aot.py
test_doc_examples (__main__.TestTypeHints) from test_type_hints.py
test_caching_pinned_memory (__main__.TestCuda) from test_cuda.py
test_graph_concurrent_replay (__main__.TestCuda) from test_cuda.py
test_non_contiguous_tensors_nn_ConvTranspose1d_cuda_complex32 (__main__.TestModuleCUDA) from test_modules.py
test_pickle_nn_RNN_eval_mode_cuda_float64 (__main__.TestModuleCUDA) from test_modules.py
test_op_has_batch_rule_nn_functional_conv_transpose3d_cuda_float32 (__main__.TestVmapOperatorsOpInfoCUDA) from functorch/test_vmap.py
test_geometric_kstest_cuda_float32 (__main__.TestTorchDeviceTypeCUDA) from test_torch.py
test_profiler_experimental_tree_with_memory (__main__.TestProfilerTree) from profiler/test_profiler_tree.py
test_fs_pool (__main__.TestMultiprocessing) from test_multiprocessing.py
test_forward_mode_AD_linalg_lu_factor_ex_cuda_complex128 (__main__.TestFwdGradientsCUDA) from test_ops_fwd_gradients.py
test_vjp_linalg_lu_cuda_float32 (__main__.TestOperatorsCUDA) from functorch/test_ops.py
test_inplace_grad_fmod_cuda_float64 (__main__.TestBwdGradientsCUDA) from test_ops_gradients.py
test_inplace_gradgrad_remainder_cuda_float64 (__main__.TestBwdGradientsCUDA) from test_ops_gradients.py
test_bottleneck_cuda (__main__.TestBottleneck) from test_utils.py
test_comprehensive_empty_strided_cuda_int32 (__main__.TestInductorOpInfoCUDA) from inductor/test_torchinductor_opinfo.py
test_vmapvjpvjp_linalg_lu_cuda_float32 (__main__.TestOperatorsCUDA) from functorch/test_ops.py
The following 11 are still flaky:
test_transpose_with_norm (__main__.CPUReproTests) from inductor/test_cpu_repro.py, failing 215/215
test_compare_cpu_linalg_pinv_singular_cuda_float32 (__main__.TestCommonCUDA) from test_ops.py, failing 100/100
test_conv_bn_fuse_dynamic_shapes_cpu (__main__.DynamicShapesCodegenCpuTests) from inductor/test_torchinductor_codegen_dynamic_shapes.py, failing 115/115
test_lobpcg (__main__.TestAutograd) from test_autograd.py, failing 50/50
test_module_attribute_mutation_violation_negative_3 (__main__.MutationExportTests) from dynamo/test_export_mutations.py, failing 2/50
test_Conv2d_dilated_cuda_tf32 (__main__.TestNN) from test_nn.py, failing 1/50
test_grad_nn_GroupNorm_cuda_float64 (__main__.TestModuleCUDA) from test_modules.py, failing 50/50
test_index_add_correctness (__main__.TestTorch) from test_torch.py, failing 22/50
test_attn_cuda (__main__.TestMin) from functorch/test_dims.py, failing 1/50
test_open_device_registration (__main__.TestCppExtensionOpenRgistration) from test_cpp_extensions_open_device_registration.py, failing 50/50
test_gradgrad_nn_GroupNorm_cuda_float64 (__main__.TestModuleCUDA) from test_modules.py, failing 50/50
```
* Uploading tests stats for rerunning disabled tests takes only half a minute
```
time python3 -m tools.stats.upload_test_stats --workflow-run-id 5229037746 --workflow-run-attempt 1 --head-branch main
31.94s user 2.94s system 44% cpu 1:19.07 total
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/103476
Approved by: https://github.com/clee2000
2023-06-13 17:07:40 +00:00
|
|
|
|
2025-01-29 23:48:47 +00:00
|
|
|
# S3 bucket where GitHub Actions artifacts are mirrored (read by
# download_s3_artifacts below).
GHA_ARTIFACTS_BUCKET = "gha-artifacts"
|
|
|
|
|
|
|
|
|
|
|
Fix rerun disabled test uploading logic (#103476)
After https://github.com/pytorch/pytorch/pull/102107, rerunning disabled tests only collect and run disable tests. A side effect of this change is that the skip message `Test is enabled but --rerun-disabled-tests verification mode is set, so only disabled tests are run` isn't in the test report anymore as these non-disabled tests are not going to be collected in the first place. This breaks the logic in the uploading script that depends on this string to know if the test report belongs to a rerunning disabled tests workflow.
* This PR updates the logic in `is_rerun_disabled_tests` check to count the number of times a test is run instead. In rerunning disabled tests mode, a test is run 50 times by default and 15 times for distributed tests (to avoid timeout). Both these numbers are larger than the max number of retries a test can get normally (3 x 3)
* This also removes the hacky `is_rerun_disabled_tests` check in `tools/stats/upload_test_stats.py` as rerun disabled tests reports are now very small (50 x the number of disabled tests)
### Testing
* `test_gradgrad_nn_GroupNorm_cuda_float64` now shows up correctly https://github.com/pytorch/pytorch/issues/98678
```
python3 -m tools.stats.check_disabled_tests --workflow-run-id 5229037746 --workflow-run-attempt 1 --repo "pytorch/pytorch"
Using temporary directory: /var/folders/x4/2kd9r0fn5b9bf_sbcw16fxsc0000gn/T/tmpdojg5vq5
Downloading test-reports-test-default-1-4-linux.g5.4xlarge.nvidia.gpu_14154925022.zip
Downloading test-reports-test-default-1-4-linux.g5.4xlarge.nvidia.gpu_14154925093.zip
Downloading test-reports-test-default-2-4-linux.g5.4xlarge.nvidia.gpu_14154925167.zip
Downloading test-reports-test-default-2-4-linux.g5.4xlarge.nvidia.gpu_14154925226.zip
Downloading test-reports-test-default-3-4-linux.g5.4xlarge.nvidia.gpu_14154925295.zip
Downloading test-reports-test-default-3-4-linux.g5.4xlarge.nvidia.gpu_14154925371.zip
Downloading test-reports-test-default-4-4-linux.g5.4xlarge.nvidia.gpu_14154925453.zip
Downloading test-reports-test-default-4-4-linux.g5.4xlarge.nvidia.gpu_14154925536.zip
Downloading test-reports-test-slow-1-1-linux.2xlarge_14154853469.zip
Downloading test-reports-test-slow-1-1-linux.rocm.gpu_14154932523.zip
Downloading test-reports-test-slow-1-1-linux.rocm.gpu_14154932563.zip
Downloading test-reports-test-slow-1-2-linux.4xlarge_14154873704.zip
Downloading test-reports-test-slow-1-2-linux.g5.4xlarge.nvidia.gpu_14154931154.zip
Downloading test-reports-test-slow-1-2-linux.g5.4xlarge.nvidia.gpu_14154931186.zip
Downloading test-reports-test-slow-2-2-linux.4xlarge_14154873756.zip
Downloading test-reports-test-slow-2-2-linux.g5.4xlarge.nvidia.gpu_14154931225.zip
Downloading test-reports-test-slow-2-2-linux.g5.4xlarge.nvidia.gpu_14154931267.zip
Extracting test-reports-test-default-1-4-linux.g5.4xlarge.nvidia.gpu_14154925022.zip to unzipped-test-reports-test-default-1-4-linux.g5.4xlarge.nvidia.gpu_14154925022
Extracting test-reports-test-default-1-4-linux.g5.4xlarge.nvidia.gpu_14154925093.zip to unzipped-test-reports-test-default-1-4-linux.g5.4xlarge.nvidia.gpu_14154925093
Extracting test-reports-test-default-2-4-linux.g5.4xlarge.nvidia.gpu_14154925167.zip to unzipped-test-reports-test-default-2-4-linux.g5.4xlarge.nvidia.gpu_14154925167
Extracting test-reports-test-default-2-4-linux.g5.4xlarge.nvidia.gpu_14154925226.zip to unzipped-test-reports-test-default-2-4-linux.g5.4xlarge.nvidia.gpu_14154925226
Extracting test-reports-test-default-3-4-linux.g5.4xlarge.nvidia.gpu_14154925295.zip to unzipped-test-reports-test-default-3-4-linux.g5.4xlarge.nvidia.gpu_14154925295
Extracting test-reports-test-default-3-4-linux.g5.4xlarge.nvidia.gpu_14154925371.zip to unzipped-test-reports-test-default-3-4-linux.g5.4xlarge.nvidia.gpu_14154925371
Extracting test-reports-test-default-4-4-linux.g5.4xlarge.nvidia.gpu_14154925453.zip to unzipped-test-reports-test-default-4-4-linux.g5.4xlarge.nvidia.gpu_14154925453
Extracting test-reports-test-default-4-4-linux.g5.4xlarge.nvidia.gpu_14154925536.zip to unzipped-test-reports-test-default-4-4-linux.g5.4xlarge.nvidia.gpu_14154925536
Extracting test-reports-test-slow-1-1-linux.2xlarge_14154853469.zip to unzipped-test-reports-test-slow-1-1-linux.2xlarge_14154853469
Extracting test-reports-test-slow-1-1-linux.rocm.gpu_14154932523.zip to unzipped-test-reports-test-slow-1-1-linux.rocm.gpu_14154932523
Extracting test-reports-test-slow-1-1-linux.rocm.gpu_14154932563.zip to unzipped-test-reports-test-slow-1-1-linux.rocm.gpu_14154932563
Extracting test-reports-test-slow-1-2-linux.4xlarge_14154873704.zip to unzipped-test-reports-test-slow-1-2-linux.4xlarge_14154873704
Extracting test-reports-test-slow-1-2-linux.g5.4xlarge.nvidia.gpu_14154931154.zip to unzipped-test-reports-test-slow-1-2-linux.g5.4xlarge.nvidia.gpu_14154931154
Extracting test-reports-test-slow-1-2-linux.g5.4xlarge.nvidia.gpu_14154931186.zip to unzipped-test-reports-test-slow-1-2-linux.g5.4xlarge.nvidia.gpu_14154931186
Extracting test-reports-test-slow-2-2-linux.4xlarge_14154873756.zip to unzipped-test-reports-test-slow-2-2-linux.4xlarge_14154873756
Extracting test-reports-test-slow-2-2-linux.g5.4xlarge.nvidia.gpu_14154931225.zip to unzipped-test-reports-test-slow-2-2-linux.g5.4xlarge.nvidia.gpu_14154931225
Extracting test-reports-test-slow-2-2-linux.g5.4xlarge.nvidia.gpu_14154931267.zip to unzipped-test-reports-test-slow-2-2-linux.g5.4xlarge.nvidia.gpu_14154931267
Downloading test-reports-runattempt1-test-slow-1-1-linux.rocm.gpu_14154932523.zip
Downloading test-reports-runattempt1-test-slow-1-1-linux.rocm.gpu_14154932563.zip
Extracting test-reports-runattempt1-test-slow-1-1-linux.rocm.gpu_14154932523.zip to unzipped-test-reports-runattempt1-test-slow-1-1-linux.rocm.gpu_14154932523
Extracting test-reports-runattempt1-test-slow-1-1-linux.rocm.gpu_14154932563.zip to unzipped-test-reports-runattempt1-test-slow-1-1-linux.rocm.gpu_14154932563
The following 32 tests should be re-enabled:
test_huge_index (__main__.TestCuda) from test_cuda.py
test_conv_bn_fuse_cpu (__main__.CpuTests) from inductor/test_torchinductor.py
test_multi_threads (__main__.TestTorchrun) from backends/xeon/test_launch.py
test_huge_index (__main__.TestCuda) from test_cuda_expandable_segments.py
test_memory_timeline_no_id (__main__.TestMemoryProfilerE2E) from profiler/test_memory_profiler.py
test_inverse_errors_large_cuda_float64 (__main__.TestLinalgCUDA) from test_linalg.py
test_trace_dependencies (__main__.TestAnalyze) from test_package.py
test_caching_pinned_memory (__main__.TestCuda) from test_cuda_expandable_segments.py
test_graph_concurrent_replay (__main__.TestCuda) from test_cuda_expandable_segments.py
test_module_attribute_mutation_violation_negative_1 (__main__.MutationExportTests) from dynamo/test_export_mutations.py
test_module_attribute_mutation_violation_negative_2 (__main__.MutationExportTests) from dynamo/test_export_mutations.py
test_module_attribute_mutation_violation_negative_4 (__main__.MutationExportTests) from dynamo/test_export_mutations.py
test_vmapjvpall_linalg_lu_cuda_float32 (__main__.TestOperatorsCUDA) from functorch/test_ops.py
test_vmapjvpvjp_linalg_lu_cuda_float32 (__main__.TestOperatorsCUDA) from functorch/test_ops.py
test_Conv2d_no_bias_cuda_tf32 (__main__.TestNN) from test_nn.py
test_save_graph_repro (__main__.TestAfterAot) from dynamo/test_after_aot.py
test_doc_examples (__main__.TestTypeHints) from test_type_hints.py
test_caching_pinned_memory (__main__.TestCuda) from test_cuda.py
test_graph_concurrent_replay (__main__.TestCuda) from test_cuda.py
test_non_contiguous_tensors_nn_ConvTranspose1d_cuda_complex32 (__main__.TestModuleCUDA) from test_modules.py
test_pickle_nn_RNN_eval_mode_cuda_float64 (__main__.TestModuleCUDA) from test_modules.py
test_op_has_batch_rule_nn_functional_conv_transpose3d_cuda_float32 (__main__.TestVmapOperatorsOpInfoCUDA) from functorch/test_vmap.py
test_geometric_kstest_cuda_float32 (__main__.TestTorchDeviceTypeCUDA) from test_torch.py
test_profiler_experimental_tree_with_memory (__main__.TestProfilerTree) from profiler/test_profiler_tree.py
test_fs_pool (__main__.TestMultiprocessing) from test_multiprocessing.py
test_forward_mode_AD_linalg_lu_factor_ex_cuda_complex128 (__main__.TestFwdGradientsCUDA) from test_ops_fwd_gradients.py
test_vjp_linalg_lu_cuda_float32 (__main__.TestOperatorsCUDA) from functorch/test_ops.py
test_inplace_grad_fmod_cuda_float64 (__main__.TestBwdGradientsCUDA) from test_ops_gradients.py
test_inplace_gradgrad_remainder_cuda_float64 (__main__.TestBwdGradientsCUDA) from test_ops_gradients.py
test_bottleneck_cuda (__main__.TestBottleneck) from test_utils.py
test_comprehensive_empty_strided_cuda_int32 (__main__.TestInductorOpInfoCUDA) from inductor/test_torchinductor_opinfo.py
test_vmapvjpvjp_linalg_lu_cuda_float32 (__main__.TestOperatorsCUDA) from functorch/test_ops.py
The following 11 are still flaky:
test_transpose_with_norm (__main__.CPUReproTests) from inductor/test_cpu_repro.py, failing 215/215
test_compare_cpu_linalg_pinv_singular_cuda_float32 (__main__.TestCommonCUDA) from test_ops.py, failing 100/100
test_conv_bn_fuse_dynamic_shapes_cpu (__main__.DynamicShapesCodegenCpuTests) from inductor/test_torchinductor_codegen_dynamic_shapes.py, failing 115/115
test_lobpcg (__main__.TestAutograd) from test_autograd.py, failing 50/50
test_module_attribute_mutation_violation_negative_3 (__main__.MutationExportTests) from dynamo/test_export_mutations.py, failing 2/50
test_Conv2d_dilated_cuda_tf32 (__main__.TestNN) from test_nn.py, failing 1/50
test_grad_nn_GroupNorm_cuda_float64 (__main__.TestModuleCUDA) from test_modules.py, failing 50/50
test_index_add_correctness (__main__.TestTorch) from test_torch.py, failing 22/50
test_attn_cuda (__main__.TestMin) from functorch/test_dims.py, failing 1/50
test_open_device_registration (__main__.TestCppExtensionOpenRgistration) from test_cpp_extensions_open_device_registration.py, failing 50/50
test_gradgrad_nn_GroupNorm_cuda_float64 (__main__.TestModuleCUDA) from test_modules.py, failing 50/50
```
* Uploading tests stats for rerunning disabled tests takes only half a minute
```
time python3 -m tools.stats.upload_test_stats --workflow-run-id 5229037746 --workflow-run-attempt 1 --head-branch main
31.94s user 2.94s system 44% cpu 1:19.07 total
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/103476
Approved by: https://github.com/clee2000
2023-06-13 17:07:40 +00:00
|
|
|
# NB: In CI, a flaky test is usually retried 3 times, then the test file would be rerun
# 2 more times
# 3 retries x 3 runs of the file = the most attempts a test can accumulate
# outside of --rerun-disabled-tests mode.
MAX_RETRY_IN_NON_DISABLED_MODE = 3 * 3
|
2022-06-12 17:29:01 +00:00
|
|
|
|
|
|
|
|
|
2024-06-29 04:48:06 +00:00
|
|
|
def _get_request_headers() -> dict[str, str]:
    """Build the standard headers for GitHub REST API requests.

    Reads the ``GITHUB_TOKEN`` environment variable; raises ``KeyError``
    when it is not set.
    """
    token = os.environ["GITHUB_TOKEN"]
    return {
        "Accept": "application/vnd.github.v3+json",
        "Authorization": f"token {token}",
    }
|
|
|
|
|
|
|
|
|
|
|
2024-06-29 04:48:06 +00:00
|
|
|
def _get_artifact_urls(prefix: str, workflow_run_id: int) -> dict[Path, str]:
    """Get all workflow artifacts with 'test-report' in the name."""
    response = requests.get(
        f"{PYTORCH_REPO}/actions/runs/{workflow_run_id}/artifacts?per_page=100",
        headers=_get_request_headers(),
    )
    artifacts = response.json()["artifacts"]

    # Follow GitHub's Link-header pagination until the last page is reached.
    while "next" in response.links:
        response = requests.get(
            response.links["next"]["url"], headers=_get_request_headers()
        )
        artifacts.extend(response.json()["artifacts"])

    # Keep only artifacts whose name matches the requested prefix.
    return {
        Path(artifact["name"]): artifact["archive_download_url"]
        for artifact in artifacts
        if artifact["name"].startswith(prefix)
    }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _download_artifact(
    artifact_name: Path, artifact_url: str, workflow_run_attempt: int
) -> Path:
    """Download a single GitHub Actions artifact to the local path ``artifact_name``.

    Returns ``artifact_name`` (the file it wrote to).
    """
    # [Artifact run attempt]
    # All artifacts on a workflow share a single namespace. However, we can
    # re-run a workflow and produce a new set of artifacts. To avoid name
    # collisions, we add `-runattempt1<run #>-` somewhere in the artifact name.
    #
    # This code parses out the run attempt number from the artifact name. If it
    # doesn't match the one specified on the command line, skip it.
    atoms = str(artifact_name).split("-")
    for atom in atoms:
        if atom.startswith("runattempt"):
            found_run_attempt = int(atom[len("runattempt") :])
            if workflow_run_attempt != found_run_attempt:
                print(
                    f"Skipping {artifact_name} as it is an invalid run attempt. "
                    f"Expected {workflow_run_attempt}, found {found_run_attempt}."
                )
                # NOTE(review): despite the "Skipping" message above, nothing is
                # actually skipped -- control falls through and the artifact is
                # downloaded anyway. Confirm whether a `return`/`continue` was
                # intended here before changing it: callers currently receive a
                # path for every artifact.

    print(f"Downloading {artifact_name}")

    response = requests.get(artifact_url, headers=_get_request_headers())
    with open(artifact_name, "wb") as f:
        f.write(response.content)
    return artifact_name
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def download_s3_artifacts(
    prefix: str,
    workflow_run_id: int,
    workflow_run_attempt: int,
    job_id: Optional[int] = None,
) -> list[Path]:
    """Download artifacts mirrored in the GHA_ARTIFACTS_BUCKET S3 bucket.

    Objects are looked up under
    ``pytorch/pytorch/<run_id>/<run_attempt>/artifact/<prefix>`` and written
    into the current working directory, one local file per S3 object.

    When ``job_id`` is given, only objects whose file name contains that id
    (as a substring) are downloaded.

    Returns the list of local paths written. Emits a GitHub Actions
    ``::warning`` annotation when nothing matched, since an empty result
    usually indicates an upstream bug rather than a legitimately empty run.
    """
    bucket = get_s3_resource().Bucket(GHA_ARTIFACTS_BUCKET)
    objs = bucket.objects.filter(
        Prefix=f"pytorch/pytorch/{workflow_run_id}/{workflow_run_attempt}/artifact/{prefix}"
    )
    found_one = False
    paths = []
    for obj in objs:
        object_name = Path(obj.key).name
        # target an artifact for a specific job_id if provided, otherwise skip the download.
        if job_id is not None and str(job_id) not in object_name:
            continue
        found_one = True
        # Flatten the S3 key: keep only the final path component locally.
        p = Path(Path(obj.key).name)
        print(f"Downloading {p}")
        with open(p, "wb") as f:
            f.write(obj.get()["Body"].read())
        paths.append(p)

    if not found_one:
        print(
            "::warning title=s3 artifacts not found::"
            "Didn't find any test reports in s3, there might be a bug!"
        )
    return paths
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def download_gha_artifacts(
    prefix: str, workflow_run_id: int, workflow_run_attempt: int
) -> list[Path]:
    """Download every GitHub Actions artifact matching *prefix* for a run.

    Returns the local paths of the downloaded artifact files.
    """
    artifact_urls = _get_artifact_urls(prefix, workflow_run_id)
    return [
        _download_artifact(Path(name), url, workflow_run_attempt)
        for name, url in artifact_urls.items()
    ]
|
|
|
|
|
|
|
|
|
|
|
2024-07-05 16:31:49 +00:00
|
|
|
def upload_to_dynamodb(
    dynamodb_table: str,
    repo: str,
    docs: list[Any],
    generate_partition_key: Optional[Callable[[str, dict[str, Any]], str]],
) -> None:
    """Batch-write ``docs`` to the given DynamoDB table.

    NOTE: mutates each doc in place, adding a ``dynamoKey`` partition key
    (when ``generate_partition_key`` is provided) and a millisecond
    ``timestamp`` field.
    """
    print(f"Writing {len(docs)} documents to DynamoDB {dynamodb_table}")
    # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/dynamodb.html#batch-writing
    with boto3.resource("dynamodb").Table(dynamodb_table).batch_writer() as batch:
        for doc in docs:
            if generate_partition_key:
                doc["dynamoKey"] = generate_partition_key(repo, doc)
            # This is to move away the _event_time field from Rockset, which we cannot use when
            # reimport the data
            doc["timestamp"] = int(round(time.time() * 1000))
            batch.put_item(Item=doc)
|
|
|
|
|
|
|
|
|
|
|
2022-06-30 07:27:44 +00:00
|
|
|
def upload_to_s3(
    bucket_name: str,
    key: str,
    docs: list[dict[str, Any]],
) -> None:
    """Serialize *docs* as gzip-compressed JSON Lines and upload to S3.

    Each document becomes one newline-terminated JSON object in the body of
    ``s3://<bucket_name>/<key>``.
    """
    print(f"Writing {len(docs)} documents to S3")
    buffer = io.StringIO()
    for document in docs:
        buffer.write(json.dumps(document))
        buffer.write("\n")

    s3_object = get_s3_resource().Object(
        f"{bucket_name}",
        f"{key}",
    )
    s3_object.put(
        Body=gzip.compress(buffer.getvalue().encode()),
        ContentEncoding="gzip",
        ContentType="application/json",
    )
    print("Done!")
|
|
|
|
|
|
|
|
|
|
|
2023-05-15 23:46:50 +00:00
|
|
|
def read_from_s3(
    bucket_name: str,
    key: str,
) -> list[dict[str, Any]]:
    """Fetch a gzip-compressed JSON Lines object from S3 and parse it.

    Inverse of ``upload_to_s3``: each non-empty line of the decompressed
    body is decoded as one JSON document.
    """
    print(f"Reading from s3://{bucket_name}/{key}")
    s3_object = get_s3_resource().Object(
        f"{bucket_name}",
        f"{key}",
    )
    body = s3_object.get()["Body"].read()
    lines = gzip.decompress(body).decode().split("\n")
    return [json.loads(line) for line in lines if line]
|
|
|
|
|
|
|
|
|
|
|
2024-10-01 21:47:46 +00:00
|
|
|
def remove_nan_inf(old: Any) -> Any:
    """Recursively replace non-finite floats with their string form.

    ``json.dumps`` produces invalid JSON for NaN, inf, and -inf, so they are
    cast to strings (e.g. ``"nan"``, ``"inf"``) before serialization.
    Lists, tuples, and dicts (keys and values) are rebuilt; every other
    value passes through unchanged.
    """

    def _sanitize(value: Any) -> Any:
        # A float is NaN/inf/-inf exactly when it is not finite.
        if isinstance(value, float) and not math.isfinite(value):
            return str(value)
        if isinstance(value, list):
            return [_sanitize(item) for item in value]
        if isinstance(value, dict):
            return {_sanitize(k): _sanitize(v) for k, v in value.items()}
        if isinstance(value, tuple):
            return tuple(_sanitize(item) for item in value)
        return value

    return _sanitize(old)
|
|
|
|
|
|
|
|
|
|
|
2023-03-02 18:36:20 +00:00
|
|
|
def upload_workflow_stats_to_s3(
    workflow_run_id: int,
    workflow_run_attempt: int,
    collection: str,
    docs: list[dict[str, Any]],
) -> None:
    """Upload docs for one workflow run attempt to the ossci-raw-job-status bucket.

    The S3 key is `<collection>/<workflow_run_id>/<workflow_run_attempt>`.
    """
    upload_to_s3(
        "ossci-raw-job-status",
        f"{collection}/{workflow_run_id}/{workflow_run_attempt}",
        docs,
    )
|
|
|
|
|
|
|
|
|
|
|
2022-10-29 17:40:07 +00:00
|
|
|
def upload_file_to_s3(
    file_name: str,
    bucket: str,
    key: str,
) -> None:
    """
    Upload a local file to S3
    """
    print(f"Upload {file_name} to s3://{bucket}/{key}")
    s3_client = boto3.client("s3")
    s3_client.upload_file(file_name, bucket, key)
|
|
|
|
|
|
|
|
|
|
|
2022-06-12 17:29:01 +00:00
|
|
|
def unzip(p: Path) -> None:
    """Unzip the provided zipfile to a similarly-named directory.

    Returns None if `p` is not a zipfile.

    Looks like: /tmp/test-reports.zip -> /tmp/unzipped-test-reports/
    """
    assert p.is_file()
    target = p.with_name(f"unzipped-{p.stem}")
    print(f"Extracting {p} to {target}")

    with zipfile.ZipFile(p, "r") as archive:
        archive.extractall(target)
|
2022-11-23 22:39:36 +00:00
|
|
|
|
|
|
|
|
|
2024-06-29 04:48:06 +00:00
|
|
|
def is_rerun_disabled_tests(tests: dict[str, dict[str, int]]) -> bool:
    """
    Check if the test report is coming from rerun_disabled_tests workflow where
    each test is run multiple times
    """
    # In rerun-disabled-tests mode every test is executed many more times
    # than the normal retry budget allows, so the report qualifies only if
    # ALL tests exceed MAX_RETRY_IN_NON_DISABLED_MODE total runs.
    for stats in tests.values():
        total_runs = stats.get("num_green", 0) + stats.get("num_red", 0)
        if total_runs <= MAX_RETRY_IN_NON_DISABLED_MODE:
            return False
    return True
|
2024-04-22 20:19:35 +00:00
|
|
|
|
|
|
|
|
|
2024-06-29 04:48:06 +00:00
|
|
|
def get_job_id(report: Path) -> int | None:
    """Extract the GHA job id from a test-report path.

    [Job id in artifacts]
    Our GHA workflows append the job id to the end of the report name, so
    `report` looks like:
        unzipped-test-reports-foo_5596745227/test/test-reports/foo/TEST-foo.xml
    and we want `5596745227` — the part of the first path component after
    the last underscore. Returns None when that suffix is not an integer.
    """
    suffix = report.parts[0].rpartition("_")[2]
    try:
        return int(suffix)
    except ValueError:
        return None
|