From 86f038dd56dab9ecec5893b60efc74e46ca19e36 Mon Sep 17 00:00:00 2001 From: Catherine Lee Date: Thu, 28 Jul 2022 16:35:01 +0000 Subject: [PATCH] download test times during build to avoid race conditions (#81915) After https://github.com/pytorch/pytorch/pull/81116, we started pulling test times straight from the source instead of first downloading them in the build job and then having the test job take the build job's version. This can cause an issue where different shards pull different versions of the file, leading to incorrect sharding (ex two shards running the same test file by accident). This generally happens if the test jobs happen while the test times file is being updated (unlikely, but not impossible) or if someone reruns a test job the next day. In this PR, I return to the old method of downloading the test times file during the build job and having the test jobs pull from the build job's uploaded artifacts. If there is no test times file in the build job's artifacts, we fall back to the default sharding plan. Notes: * script moved to a new file to avoid needing to import torch, which would require torch to be built, which can cause issues with asan * I got errors with asan (`ASan runtime does not come first in initial library list; you should either link runtime to your application or manually preload it with LD_PRELOAD.`), so I put the script at the beginning of the build ### Test Plan Verified that the number of tests run in the pull and trunk workflows are similar to workflows run on master. Checked logs to see if artifacts were being used for sharding. Spot checked a few test configs to check that their lists of selected tests didn't overlap. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/81915 Approved by: https://github.com/huydhn --- .github/workflows/_linux-build.yml | 2 +- .github/workflows/_mac-build.yml | 2 +- .jenkins/pytorch/build-asan.sh | 2 ++ .jenkins/pytorch/build.sh | 6 +++++ .jenkins/pytorch/macos-build.sh | 2 ++ .../win-test-helpers/build_pytorch.bat | 4 +++ .../test_python_jit_legacy.bat | 1 + .../win-test-helpers/test_python_shard.bat | 3 +++ test/run_test.py | 27 ++++++++++++------- tools/stats/export_test_times.py | 17 ++++++++++++ tools/stats/import_test_stats.py | 5 ++-- 11 files changed, 56 insertions(+), 15 deletions(-) create mode 100644 tools/stats/export_test_times.py diff --git a/.github/workflows/_linux-build.yml b/.github/workflows/_linux-build.yml index a109e4893bd..09a400c4d50 100644 --- a/.github/workflows/_linux-build.yml +++ b/.github/workflows/_linux-build.yml @@ -137,7 +137,7 @@ jobs: - name: Archive artifacts into zip if: inputs.build-generates-artifacts run: | - zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin + zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - name: Store PyTorch Build Artifacts on S3 uses: seemethere/upload-artifact-s3@v5 diff --git a/.github/workflows/_mac-build.yml b/.github/workflows/_mac-build.yml index 0d89a6ae897..f17bd649c71 100644 --- a/.github/workflows/_mac-build.yml +++ b/.github/workflows/_mac-build.yml @@ -102,7 +102,7 @@ jobs: - name: Archive artifacts into zip if: inputs.build-generates-artifacts run: | - zip -1 -r artifacts.zip dist/ build/.ninja_log build/compile_commands.json + zip -1 -r artifacts.zip dist/ build/.ninja_log build/compile_commands.json .pytorch-test-times.json - name: Store PyTorch Build Artifacts on GHA uses: actions/upload-artifact@v2 diff --git a/.jenkins/pytorch/build-asan.sh b/.jenkins/pytorch/build-asan.sh index c07c4ddb865..d46f4bd2a68 100755 --- a/.jenkins/pytorch/build-asan.sh +++ 
b/.jenkins/pytorch/build-asan.sh @@ -12,6 +12,8 @@ source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh" echo "Clang version:" clang --version +python tools/stats/export_test_times.py + # detect_leaks=0: Python is very leaky, so we need suppress it # symbolize=1: Gives us much better errors when things go wrong export ASAN_OPTIONS=detect_leaks=0:detect_stack_use_after_return=1:symbolize=1:detect_odr_violation=0 diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index 766c97855a9..d442a4ebd41 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -296,4 +296,10 @@ else fi fi +if [[ "$BUILD_ENVIRONMENT" != *libtorch* && "$BUILD_ENVIRONMENT" != *bazel* ]]; then + # export test times so that potential sharded tests that'll branch off this build will use consistent data + # don't do this for libtorch as libtorch is C++ only and thus won't have python tests run on its build + python tools/stats/export_test_times.py +fi + print_sccache_stats diff --git a/.jenkins/pytorch/macos-build.sh b/.jenkins/pytorch/macos-build.sh index 5105e6d89b1..db33e2dedf9 100755 --- a/.jenkins/pytorch/macos-build.sh +++ b/.jenkins/pytorch/macos-build.sh @@ -73,4 +73,6 @@ if which sccache > /dev/null; then print_sccache_stats fi +python tools/stats/export_test_times.py + assert_git_not_dirty diff --git a/.jenkins/pytorch/win-test-helpers/build_pytorch.bat b/.jenkins/pytorch/win-test-helpers/build_pytorch.bat index eff4f5e5d16..b954430734b 100644 --- a/.jenkins/pytorch/win-test-helpers/build_pytorch.bat +++ b/.jenkins/pytorch/win-test-helpers/build_pytorch.bat @@ -146,6 +146,10 @@ python setup.py install --cmake && sccache --show-stats && ( if errorlevel 1 exit /b if not errorlevel 0 exit /b + :: export test times so that potential sharded tests that'll branch off this build will use consistent data + python tools/stats/export_test_times.py + copy /Y ".pytorch-test-times.json" "%PYTORCH_FINAL_PACKAGE_DIR%" + :: Also save build/.ninja_log as an artifact copy /Y 
"build\.ninja_log" "%PYTORCH_FINAL_PACKAGE_DIR%\" ) diff --git a/.jenkins/pytorch/win-test-helpers/test_python_jit_legacy.bat b/.jenkins/pytorch/win-test-helpers/test_python_jit_legacy.bat index 220cff1ff50..c18151d65c0 100644 --- a/.jenkins/pytorch/win-test-helpers/test_python_jit_legacy.bat +++ b/.jenkins/pytorch/win-test-helpers/test_python_jit_legacy.bat @@ -1,6 +1,7 @@ call %SCRIPT_HELPERS_DIR%\setup_pytorch_env.bat echo Copying over test times file +copy /Y "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.pytorch-test-times.json" "%PROJECT_DIR_WIN%" pushd test diff --git a/.jenkins/pytorch/win-test-helpers/test_python_shard.bat b/.jenkins/pytorch/win-test-helpers/test_python_shard.bat index e2bcc05efaa..5313bc0078d 100644 --- a/.jenkins/pytorch/win-test-helpers/test_python_shard.bat +++ b/.jenkins/pytorch/win-test-helpers/test_python_shard.bat @@ -21,6 +21,9 @@ if "%SHARD_NUMBER%" == "1" ( ) ) +echo Copying over test times file +copy /Y "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.pytorch-test-times.json" "%PROJECT_DIR_WIN%" + echo Run nn tests python run_test.py --exclude-jit-executor --exclude-distributed-tests --shard "%SHARD_NUMBER%" "%NUM_TEST_SHARDS%" --verbose if ERRORLEVEL 1 goto fail diff --git a/test/run_test.py b/test/run_test.py index b90f17e5301..fd7587c5683 100644 --- a/test/run_test.py +++ b/test/run_test.py @@ -13,6 +13,8 @@ import signal import subprocess import sys import tempfile +import json +from typing import Dict, Optional, List, cast, Any import torch from torch.utils import cpp_extension @@ -25,14 +27,13 @@ from torch.testing._internal.common_utils import ( parser as common_parser, ) import torch.distributed as dist -from typing import Optional, List REPO_ROOT = pathlib.Path(__file__).resolve().parent.parent try: # using tools/ to optimize test run. 
sys.path.append(str(REPO_ROOT)) - from tools.stats.import_test_stats import get_test_times + from tools.stats.export_test_times import TEST_TIMES_FILE from tools.testing.test_selections import ( get_reordered_tests, get_test_case_configs, @@ -72,6 +73,7 @@ def discover_tests( rc += extra_tests return sorted(rc) + TESTS = discover_tests( blocklisted_patterns=[ 'ao', @@ -268,9 +270,6 @@ CORE_TEST_LIST = [ "test_torch" ] -# the JSON file to store the S3 test stats -TEST_TIMES_FILE = ".pytorch-test-times.json" - # if a test file takes longer than 5 min, we add it to TARGET_DET_LIST SLOW_TEST_THRESHOLD = 300 @@ -395,6 +394,7 @@ def test_cuda_primary_ctx(test_module, test_directory, options): test_module, test_directory, options, extra_unittest_args=["--subprocess"] ) + run_test_with_subprocess = functools.partial(run_test, extra_unittest_args=["--subprocess"]) @@ -402,7 +402,6 @@ def get_run_test_with_subprocess_fn(): return lambda test_module, test_directory, options: run_test_with_subprocess(test_module, test_directory, options) - def _test_cpp_extensions_aot(test_directory, options, use_ninja): if use_ninja: try: @@ -570,6 +569,7 @@ CUSTOM_HANDLERS = { "distributed/rpc/cuda/test_tensorpipe_agent": get_run_test_with_subprocess_fn(), } + def parse_test_module(test): return test.split(".")[0] @@ -862,14 +862,21 @@ def get_selected_tests(options): return selected_tests # Download previous test times to make sharding decisions - test_file_times = get_test_times(str(REPO_ROOT), filename=TEST_TIMES_FILE) - if len(test_file_times) == 0: + path = os.path.join(str(REPO_ROOT), TEST_TIMES_FILE) + if os.path.exists(path): + with open(path, "r") as f: + test_file_times = cast(Dict[str, Any], json.load(f)) + else: + test_file_times = {} + if os.environ["TEST_CONFIG"] not in test_file_times: print( - "::warning:: Gathered no stats from S3. Proceeding with default sharding plan." + "::warning:: Gathered no stats from artifacts. Proceeding with default sharding plan." 
) selected_tests = selected_tests[which_shard - 1 :: num_shards] else: - shards = calculate_shards(num_shards, selected_tests, test_file_times) + print("Found test time stats from artifacts") + test_file_times_config = test_file_times[os.environ["TEST_CONFIG"]] + shards = calculate_shards(num_shards, selected_tests, test_file_times_config) _, tests_from_shard = shards[which_shard - 1] selected_tests = tests_from_shard diff --git a/tools/stats/export_test_times.py b/tools/stats/export_test_times.py new file mode 100644 index 00000000000..4554f546ee0 --- /dev/null +++ b/tools/stats/export_test_times.py @@ -0,0 +1,17 @@ +import pathlib +import sys + +REPO_ROOT = pathlib.Path(__file__).resolve().parent.parent.parent +sys.path.append(str(REPO_ROOT)) +from tools.stats.import_test_stats import get_test_times + +TEST_TIMES_FILE = ".pytorch-test-times.json" + + +def main() -> None: + print(f"Exporting test times from test-infra to {TEST_TIMES_FILE}") + get_test_times(str(REPO_ROOT), filename=TEST_TIMES_FILE) + + +if __name__ == "__main__": + main() diff --git a/tools/stats/import_test_stats.py b/tools/stats/import_test_stats.py index 0203f405a41..fbc33a685d4 100644 --- a/tools/stats/import_test_stats.py +++ b/tools/stats/import_test_stats.py @@ -81,13 +81,12 @@ def get_slow_tests( return {} -def get_test_times(dirpath: str, filename: str) -> Dict[str, float]: +def get_test_times(dirpath: str, filename: str) -> Dict[str, Dict[str, float]]: url = "https://raw.githubusercontent.com/pytorch/test-infra/generated-stats/stats/test-times.json" def process_response(the_response: Dict[str, Any]) -> Any: build_environment = os.environ["BUILD_ENVIRONMENT"] - test_config = os.environ["TEST_CONFIG"] - return the_response[build_environment][test_config] + return the_response[build_environment] try: return fetch_and_cache(dirpath, filename, url, process_response)