diff --git a/.github/workflows/_linux-build.yml b/.github/workflows/_linux-build.yml index a109e4893bd..09a400c4d50 100644 --- a/.github/workflows/_linux-build.yml +++ b/.github/workflows/_linux-build.yml @@ -137,7 +137,7 @@ jobs: - name: Archive artifacts into zip if: inputs.build-generates-artifacts run: | - zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin + zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - name: Store PyTorch Build Artifacts on S3 uses: seemethere/upload-artifact-s3@v5 diff --git a/.github/workflows/_mac-build.yml b/.github/workflows/_mac-build.yml index 0d89a6ae897..f17bd649c71 100644 --- a/.github/workflows/_mac-build.yml +++ b/.github/workflows/_mac-build.yml @@ -102,7 +102,7 @@ jobs: - name: Archive artifacts into zip if: inputs.build-generates-artifacts run: | - zip -1 -r artifacts.zip dist/ build/.ninja_log build/compile_commands.json + zip -1 -r artifacts.zip dist/ build/.ninja_log build/compile_commands.json .pytorch-test-times.json - name: Store PyTorch Build Artifacts on GHA uses: actions/upload-artifact@v2 diff --git a/.jenkins/pytorch/build-asan.sh b/.jenkins/pytorch/build-asan.sh index c07c4ddb865..d46f4bd2a68 100755 --- a/.jenkins/pytorch/build-asan.sh +++ b/.jenkins/pytorch/build-asan.sh @@ -12,6 +12,8 @@ source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh" echo "Clang version:" clang --version +python tools/stats/export_test_times.py + # detect_leaks=0: Python is very leaky, so we need suppress it # symbolize=1: Gives us much better errors when things go wrong export ASAN_OPTIONS=detect_leaks=0:detect_stack_use_after_return=1:symbolize=1:detect_odr_violation=0 diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index 766c97855a9..d442a4ebd41 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -296,4 +296,10 @@ else fi fi +if [[ "$BUILD_ENVIRONMENT" != *libtorch* && "$BUILD_ENVIRONMENT" != *bazel* ]]; then + 
# export test times so that potential sharded tests that'll branch off this build will use consistent data + # don't do this for libtorch (C++ only, so no python tests run on its build) or for bazel builds, which the condition above also skips + python tools/stats/export_test_times.py +fi + print_sccache_stats diff --git a/.jenkins/pytorch/macos-build.sh b/.jenkins/pytorch/macos-build.sh index 5105e6d89b1..db33e2dedf9 100755 --- a/.jenkins/pytorch/macos-build.sh +++ b/.jenkins/pytorch/macos-build.sh @@ -73,4 +73,6 @@ if which sccache > /dev/null; then print_sccache_stats fi +python tools/stats/export_test_times.py + assert_git_not_dirty diff --git a/.jenkins/pytorch/win-test-helpers/build_pytorch.bat b/.jenkins/pytorch/win-test-helpers/build_pytorch.bat index eff4f5e5d16..b954430734b 100644 --- a/.jenkins/pytorch/win-test-helpers/build_pytorch.bat +++ b/.jenkins/pytorch/win-test-helpers/build_pytorch.bat @@ -146,6 +146,10 @@ python setup.py install --cmake && sccache --show-stats && ( if errorlevel 1 exit /b if not errorlevel 0 exit /b + :: export test times so that potential sharded tests that'll branch off this build will use consistent data + python tools/stats/export_test_times.py + copy /Y ".pytorch-test-times.json" "%PYTORCH_FINAL_PACKAGE_DIR%" + :: Also save build/.ninja_log as an artifact copy /Y "build\.ninja_log" "%PYTORCH_FINAL_PACKAGE_DIR%\" ) diff --git a/.jenkins/pytorch/win-test-helpers/test_python_jit_legacy.bat b/.jenkins/pytorch/win-test-helpers/test_python_jit_legacy.bat index 220cff1ff50..c18151d65c0 100644 --- a/.jenkins/pytorch/win-test-helpers/test_python_jit_legacy.bat +++ b/.jenkins/pytorch/win-test-helpers/test_python_jit_legacy.bat @@ -1,6 +1,7 @@ call %SCRIPT_HELPERS_DIR%\setup_pytorch_env.bat echo Copying over test times file +copy /Y "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.pytorch-test-times.json" "%PROJECT_DIR_WIN%" pushd test diff --git a/.jenkins/pytorch/win-test-helpers/test_python_shard.bat b/.jenkins/pytorch/win-test-helpers/test_python_shard.bat index
e2bcc05efaa..5313bc0078d 100644 --- a/.jenkins/pytorch/win-test-helpers/test_python_shard.bat +++ b/.jenkins/pytorch/win-test-helpers/test_python_shard.bat @@ -21,6 +21,9 @@ if "%SHARD_NUMBER%" == "1" ( ) ) +echo Copying over test times file +copy /Y "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.pytorch-test-times.json" "%PROJECT_DIR_WIN%" + echo Run nn tests python run_test.py --exclude-jit-executor --exclude-distributed-tests --shard "%SHARD_NUMBER%" "%NUM_TEST_SHARDS%" --verbose if ERRORLEVEL 1 goto fail diff --git a/test/run_test.py b/test/run_test.py index b90f17e5301..fd7587c5683 100644 --- a/test/run_test.py +++ b/test/run_test.py @@ -13,6 +13,8 @@ import signal import subprocess import sys import tempfile +import json +from typing import Dict, Optional, List, cast, Any import torch from torch.utils import cpp_extension @@ -25,14 +27,13 @@ from torch.testing._internal.common_utils import ( parser as common_parser, ) import torch.distributed as dist -from typing import Optional, List REPO_ROOT = pathlib.Path(__file__).resolve().parent.parent try: # using tools/ to optimize test run. 
sys.path.append(str(REPO_ROOT)) - from tools.stats.import_test_stats import get_test_times + from tools.stats.export_test_times import TEST_TIMES_FILE from tools.testing.test_selections import ( get_reordered_tests, get_test_case_configs, @@ -72,6 +73,7 @@ def discover_tests( rc += extra_tests return sorted(rc) + TESTS = discover_tests( blocklisted_patterns=[ 'ao', @@ -268,9 +270,6 @@ CORE_TEST_LIST = [ "test_torch" ] -# the JSON file to store the S3 test stats -TEST_TIMES_FILE = ".pytorch-test-times.json" - # if a test file takes longer than 5 min, we add it to TARGET_DET_LIST SLOW_TEST_THRESHOLD = 300 @@ -395,6 +394,7 @@ def test_cuda_primary_ctx(test_module, test_directory, options): test_module, test_directory, options, extra_unittest_args=["--subprocess"] ) + run_test_with_subprocess = functools.partial(run_test, extra_unittest_args=["--subprocess"]) @@ -402,7 +402,6 @@ def get_run_test_with_subprocess_fn(): return lambda test_module, test_directory, options: run_test_with_subprocess(test_module, test_directory, options) - def _test_cpp_extensions_aot(test_directory, options, use_ninja): if use_ninja: try: @@ -570,6 +569,7 @@ CUSTOM_HANDLERS = { "distributed/rpc/cuda/test_tensorpipe_agent": get_run_test_with_subprocess_fn(), } + def parse_test_module(test): return test.split(".")[0] @@ -862,14 +862,22 @@ def get_selected_tests(options): return selected_tests - # Download previous test times to make sharding decisions - test_file_times = get_test_times(str(REPO_ROOT), filename=TEST_TIMES_FILE) - if len(test_file_times) == 0: + # Load the test times exported by tools/stats/export_test_times.py to make sharding decisions + path = os.path.join(str(REPO_ROOT), TEST_TIMES_FILE) + if os.path.exists(path): + with open(path, "r") as f: + test_file_times = cast(Dict[str, Any], json.load(f)) + else: + test_file_times = {} + # TEST_CONFIG may be unset; .get() makes that fall through to the default sharding plan instead of raising KeyError + if os.environ.get("TEST_CONFIG") not in test_file_times: print( - "::warning:: Gathered no stats from S3. Proceeding with default sharding plan." + "::warning:: Gathered no stats from artifacts. Proceeding with default sharding plan."
) selected_tests = selected_tests[which_shard - 1 :: num_shards] else: - shards = calculate_shards(num_shards, selected_tests, test_file_times) + print("Found test time stats from artifacts") + test_file_times_config = test_file_times[os.environ["TEST_CONFIG"]] + shards = calculate_shards(num_shards, selected_tests, test_file_times_config) _, tests_from_shard = shards[which_shard - 1] selected_tests = tests_from_shard diff --git a/tools/stats/export_test_times.py b/tools/stats/export_test_times.py new file mode 100644 index 00000000000..4554f546ee0 --- /dev/null +++ b/tools/stats/export_test_times.py @@ -0,0 +1,17 @@ +import pathlib +import sys + +REPO_ROOT = pathlib.Path(__file__).resolve().parent.parent.parent +sys.path.append(str(REPO_ROOT)) +from tools.stats.import_test_stats import get_test_times + +TEST_TIMES_FILE = ".pytorch-test-times.json" + + +def main() -> None: + print(f"Exporting test times from test-infra to {TEST_TIMES_FILE}") + get_test_times(str(REPO_ROOT), filename=TEST_TIMES_FILE) + + +if __name__ == "__main__": + main() diff --git a/tools/stats/import_test_stats.py b/tools/stats/import_test_stats.py index 0203f405a41..fbc33a685d4 100644 --- a/tools/stats/import_test_stats.py +++ b/tools/stats/import_test_stats.py @@ -81,13 +81,12 @@ def get_slow_tests( return {} -def get_test_times(dirpath: str, filename: str) -> Dict[str, float]: +def get_test_times(dirpath: str, filename: str) -> Dict[str, Dict[str, float]]: url = "https://raw.githubusercontent.com/pytorch/test-infra/generated-stats/stats/test-times.json" def process_response(the_response: Dict[str, Any]) -> Any: build_environment = os.environ["BUILD_ENVIRONMENT"] - test_config = os.environ["TEST_CONFIG"] - return the_response[build_environment][test_config] + return the_response[build_environment] try: return fetch_and_cache(dirpath, filename, url, process_response)