From 40b0741e91ff5be22df1017d63a8c97dddcd819b Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Mon, 30 Dec 2024 07:34:20 +0000 Subject: [PATCH] 2024-12-30 nightly release (2ed4d65af0a1993c0df7b081f4088d0f3614283e) --- .../normalize_yaml_fragment.py | 2 +- .github/scripts/delete_old_branches.py | 2 +- .github/scripts/ensure_actions_will_cancel.py | 2 +- .../scripts/generate_binary_build_matrix.py | 2 +- .github/scripts/gitutils.py | 2 +- .github/scripts/lint_native_functions.py | 2 +- .github/scripts/test_gitutils.py | 2 +- .../ci_expected_accuracy/update_expected.py | 2 +- docs/source/scripts/build_opsets.py | 6 +- .../scripts/build_quantization_configs.py | 2 +- .../scripts/exportdb/generate_example_rst.py | 6 +- scripts/compile_tests/update_failures.py | 4 +- test/jit/test_backend_nnapi.py | 2 +- .../test_dynamo_with_onnxruntime_backend.py | 4 +- test/onnx/onnx_test_common.py | 7 +- .../test_torch_export_with_onnxruntime.py | 5 +- test/optim/test_optim.py | 292 +++++++++++++++++- test/package/test_save_load.py | 17 + test/quantization/core/test_docs.py | 2 +- test/test_typing.py | 2 +- third_party/xpu.txt | 2 +- tools/amd_build/build_amd.py | 18 +- tools/build_libtorch.py | 8 +- tools/code_coverage/package/oss/utils.py | 4 +- tools/code_coverage/package/util/setting.py | 5 +- tools/gen_vulkan_spv.py | 12 +- tools/linter/adapters/s3_init.py | 5 +- tools/onnx/update_default_opset_version.py | 2 +- tools/setup_helpers/cmake.py | 5 +- tools/setup_helpers/gen.py | 6 +- tools/setup_helpers/gen_unboxing.py | 6 +- tools/setup_helpers/generate_code.py | 6 +- tools/stats/export_test_times.py | 3 +- tools/stats/import_test_stats.py | 2 +- tools/test/heuristics/test_heuristics.py | 2 +- tools/test/heuristics/test_interface.py | 2 +- tools/test/heuristics/test_utils.py | 2 +- tools/test/test_gen_backend_stubs.py | 5 - tools/test/test_test_run.py | 2 +- tools/test/test_test_selections.py | 2 +- tools/test/test_upload_stats_lib.py | 2 +- tools/testing/discover_tests.py | 2 +- .../testing/do_target_determination_for_s3.py | 2 +- tools/testing/explicit_ci_jobs.py | 2 +- tools/testing/modulefinder_determinator.py | 2 +- .../target_determination/gen_artifact.py | 2 +- .../heuristics/filepath.py | 2 +- .../target_determination/heuristics/llm.py | 2 +- .../heuristics/previously_failed_in_pr.py | 2 +- .../target_determination/heuristics/utils.py | 3 +- tools/testing/test_selections.py | 2 +- torch/_dynamo/external_utils.py | 15 +- torch/_dynamo/testing.py | 7 +- torch/_inductor/runtime/compile_tasks.py | 13 +- torch/_strobelight/cli_function_profiler.py | 19 +- torch/_subclasses/fake_tensor.py | 4 +- torch/distributed/_tools/fsdp2_mem_tracker.py | 44 ++- torch/onnx/_internal/_lazy_import.py | 3 +- torch/optim/adam.py | 50 ++- torch/package/package_importer.py | 5 +- torch/testing/_internal/common_utils.py | 2 +- torch/utils/_stats.py | 12 +- .../_strobelight/cli_function_profiler.py | 19 +- torch/utils/_sympy/functions.py | 9 +- .../mixed_mm/gen_data_mixed_mm.py | 9 +- .../_autoheuristic/mixed_mm/test_mixed_mm.py | 8 +- .../mixed_mm/train_decision_mixedmm.py | 4 +- torchgen/_autoheuristic/mm/gen_data_mm.py | 9 +- .../_autoheuristic/mm/train_decision_mm.py | 4 +- .../_autoheuristic/pad_mm/gen_data_pad_mm.py | 9 +- torchgen/_autoheuristic/pad_mm/test_pad_mm.py | 8 +- .../pad_mm/train_decision_pad_mm.py | 4 +- .../_autoheuristic/pad_mm/train_pad_mm.py | 4 +- .../pad_mm/train_regression_pad_mm.py | 4 +- torchgen/gen_backend_stubs.py | 2 +- torchgen/gen_lazy_tensor.py | 2 +- 76 files changed, 569 insertions(+), 187 
deletions(-) diff --git a/.circleci/codegen_validation/normalize_yaml_fragment.py b/.circleci/codegen_validation/normalize_yaml_fragment.py index 6d15f1a5a5b..232eaa833b9 100755 --- a/.circleci/codegen_validation/normalize_yaml_fragment.py +++ b/.circleci/codegen_validation/normalize_yaml_fragment.py @@ -7,7 +7,7 @@ import yaml # Need to import modules that lie on an upward-relative path -sys.path.append(os.path.join(sys.path[0], "..")) +sys.path.append(os.path.dirname(sys.path[0])) import cimodel.lib.miniyaml as miniyaml diff --git a/.github/scripts/delete_old_branches.py b/.github/scripts/delete_old_branches.py index 9ca82eb7139..e28d33c642b 100644 --- a/.github/scripts/delete_old_branches.py +++ b/.github/scripts/delete_old_branches.py @@ -22,7 +22,7 @@ TOKEN = os.environ["GITHUB_TOKEN"] if not TOKEN: raise Exception("GITHUB_TOKEN is not set") # noqa: TRY002 -REPO_ROOT = Path(__file__).parent.parent.parent +REPO_ROOT = Path(__file__).parents[2] # Query for all PRs instead of just closed/merged because it's faster GRAPHQL_ALL_PRS_BY_UPDATED_AT = """ diff --git a/.github/scripts/ensure_actions_will_cancel.py b/.github/scripts/ensure_actions_will_cancel.py index 9e464f0dc25..2c76f09bb67 100755 --- a/.github/scripts/ensure_actions_will_cancel.py +++ b/.github/scripts/ensure_actions_will_cancel.py @@ -6,7 +6,7 @@ from pathlib import Path import yaml -REPO_ROOT = Path(__file__).resolve().parent.parent.parent +REPO_ROOT = Path(__file__).resolve().parents[2] WORKFLOWS = REPO_ROOT / ".github" / "workflows" EXPECTED_GROUP_PREFIX = ( "${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}" diff --git a/.github/scripts/generate_binary_build_matrix.py b/.github/scripts/generate_binary_build_matrix.py index eee9c6581de..5295e31a829 100644 --- a/.github/scripts/generate_binary_build_matrix.py +++ b/.github/scripts/generate_binary_build_matrix.py @@ -94,7 +94,7 @@ def get_nccl_submodule_version() -> str: from pathlib import Path nccl_version_mk = ( - Path(__file__).absolute().parent.parent.parent + Path(__file__).absolute().parents[2] / "third_party" / "nccl" / "nccl" diff --git a/.github/scripts/gitutils.py b/.github/scripts/gitutils.py index 505ba268001..42f16366032 100644 --- a/.github/scripts/gitutils.py +++ b/.github/scripts/gitutils.py @@ -32,7 +32,7 @@ def get_git_remote_name() -> str: def get_git_repo_dir() -> str: from pathlib import Path - return os.getenv("GIT_REPO_DIR", str(Path(__file__).resolve().parent.parent.parent)) + return os.getenv("GIT_REPO_DIR", str(Path(__file__).resolve().parents[2])) def fuzzy_list_to_dict(items: List[Tuple[str, str]]) -> Dict[str, List[str]]: diff --git a/.github/scripts/lint_native_functions.py b/.github/scripts/lint_native_functions.py index 4dfe9fd63e2..07504d7bdf2 100755 --- a/.github/scripts/lint_native_functions.py +++ b/.github/scripts/lint_native_functions.py @@ -26,7 +26,7 @@ def fn(base: str) -> str: return str(base / Path("aten/src/ATen/native/native_functions.yaml")) -with open(Path(__file__).parent.parent.parent / fn(".")) as f: +with open(Path(__file__).parents[2] / fn(".")) as f: contents = f.read() yaml = ruamel.yaml.YAML() # type: ignore[attr-defined] diff --git a/.github/scripts/test_gitutils.py b/.github/scripts/test_gitutils.py index c4137bad31e..b269cac3bc5 100644 --- a/.github/scripts/test_gitutils.py +++ b/.github/scripts/test_gitutils.py @@ -68,7 +68,7 @@ class TestRetriesDecorator(TestCase): class TestGitRepo(TestCase): def setUp(self) -> None: - repo_dir = BASE_DIR.parent.parent.absolute() + repo_dir = 
BASE_DIR.parent.parent.absolute() + repo_dir =
BASE_DIR.absolute().parent.parent if not (repo_dir / ".git").is_dir(): raise SkipTest( "Can't find git directory, make sure to run this test on real repo checkout" diff --git a/benchmarks/dynamo/ci_expected_accuracy/update_expected.py b/benchmarks/dynamo/ci_expected_accuracy/update_expected.py index 289f96b90e6..d767b0c2609 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/update_expected.py +++ b/benchmarks/dynamo/ci_expected_accuracy/update_expected.py @@ -71,7 +71,7 @@ ARTIFACTS_QUERY_URL = ( "c1cdfadc-6bb2-4a91-bbf9-3d19e1981cd4/run?format=JSON" ) CSV_LINTER = str( - Path(__file__).absolute().parent.parent.parent.parent + Path(__file__).absolute().parents[3] / "tools/linter/adapters/no_merge_conflict_csv_linter.py" ) diff --git a/docs/source/scripts/build_opsets.py b/docs/source/scripts/build_opsets.py index 84bc97b76d4..c752ade4d83 100644 --- a/docs/source/scripts/build_opsets.py +++ b/docs/source/scripts/build_opsets.py @@ -7,9 +7,9 @@ import torch._prims as prims from torchgen.gen import parse_native_yaml -ROOT = Path(__file__).absolute().parent.parent.parent.parent -NATIVE_FUNCTION_YAML_PATH = ROOT / Path("aten/src/ATen/native/native_functions.yaml") -TAGS_YAML_PATH = ROOT / Path("aten/src/ATen/native/tags.yaml") +ROOT = Path(__file__).absolute().parents[3] +NATIVE_FUNCTION_YAML_PATH = ROOT / "aten/src/ATen/native/native_functions.yaml" +TAGS_YAML_PATH = ROOT / "aten/src/ATen/native/tags.yaml" BUILD_DIR = "build/ir" ATEN_OPS_CSV_FILE = "aten_ops.csv" diff --git a/docs/source/scripts/build_quantization_configs.py b/docs/source/scripts/build_quantization_configs.py index bf405662040..5d1f445ade9 100644 --- a/docs/source/scripts/build_quantization_configs.py +++ b/docs/source/scripts/build_quantization_configs.py @@ -15,7 +15,7 @@ from torch.ao.quantization.backend_config.utils import ( # Create a directory for the images, if it doesn't exist QUANTIZATION_BACKEND_CONFIG_IMAGE_PATH = os.path.join( - os.path.realpath(os.path.join(__file__, "..")), "quantization_backend_configs" + os.path.realpath(os.path.dirname(__file__)), "quantization_backend_configs" ) if not os.path.exists(QUANTIZATION_BACKEND_CONFIG_IMAGE_PATH): diff --git a/docs/source/scripts/exportdb/generate_example_rst.py b/docs/source/scripts/exportdb/generate_example_rst.py index 1910ea4856d..4b7803b494e 100644 --- a/docs/source/scripts/exportdb/generate_example_rst.py +++ b/docs/source/scripts/exportdb/generate_example_rst.py @@ -11,9 +11,9 @@ from torch.export import export PWD = Path(__file__).absolute().parent -ROOT = Path(__file__).absolute().parent.parent.parent.parent -SOURCE = ROOT / Path("source") -EXPORTDB_SOURCE = SOURCE / Path("generated") / Path("exportdb") +ROOT = Path(__file__).absolute().parents[3] +SOURCE = ROOT / "source" +EXPORTDB_SOURCE = SOURCE / "generated" / "exportdb" def generate_example_rst(example_case: ExportCase): diff --git a/scripts/compile_tests/update_failures.py b/scripts/compile_tests/update_failures.py index a56e30e9987..73fb354a8d1 100755 --- a/scripts/compile_tests/update_failures.py +++ b/scripts/compile_tests/update_failures.py @@ -194,7 +194,7 @@ if __name__ == "__main__": "filename", nargs="?", default=str( - Path(__file__).absolute().parent.parent.parent + Path(__file__).absolute().parents[2] / "torch/testing/_internal/dynamo_test_failures.py" ), help="Optional path to dynamo_test_failures.py", @@ -203,7 +203,7 @@ if __name__ == "__main__": parser.add_argument( "test_dir", nargs="?", - default=str(Path(__file__).absolute().parent.parent.parent / "test"), + 
default=str(Path(__file__).absolute().parents[2] / "test"), help="Optional path to test folder", ) parser.add_argument( diff --git a/test/jit/test_backend_nnapi.py b/test/jit/test_backend_nnapi.py index 47e7ab1dab4..9f477166502 100644 --- a/test/jit/test_backend_nnapi.py +++ b/test/jit/test_backend_nnapi.py @@ -41,7 +41,7 @@ Inherits most tests from TestNNAPI, which loads Android NNAPI models without the delegate API. """ # First skip is needed for IS_WINDOWS or IS_MACOS to skip the tests. -torch_root = Path(__file__).resolve().parent.parent.parent +torch_root = Path(__file__).resolve().parents[2] lib_path = torch_root / "build" / "lib" / "libnnapi_backend.so" diff --git a/test/onnx/dynamo/test_dynamo_with_onnxruntime_backend.py b/test/onnx/dynamo/test_dynamo_with_onnxruntime_backend.py index 1d93fdf553d..e13d8def9fe 100644 --- a/test/onnx/dynamo/test_dynamo_with_onnxruntime_backend.py +++ b/test/onnx/dynamo/test_dynamo_with_onnxruntime_backend.py @@ -7,6 +7,7 @@ import dataclasses import os import sys import unittest +from pathlib import Path from typing import Tuple import onnxruntime @@ -24,7 +25,8 @@ from torch.testing._internal import common_utils from torch.testing._internal.common_utils import skipIfNNModuleInlined -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +sys.path.append(str(Path(__file__).absolute().parents[1])) + import onnx_test_common diff --git a/test/onnx/onnx_test_common.py b/test/onnx/onnx_test_common.py index 46e0efff46b..69a9a3b4e55 100644 --- a/test/onnx/onnx_test_common.py +++ b/test/onnx/onnx_test_common.py @@ -45,8 +45,7 @@ _InputArgsType = Optional[ _OutputsType = Sequence[_NumericType] onnx_model_dir = os.path.join( - os.path.dirname(os.path.realpath(__file__)), - os.pardir, + os.path.dirname(os.path.dirname(os.path.realpath(__file__))), "repos", "onnx", "onnx", @@ -54,11 +53,7 @@ onnx_model_dir = os.path.join( "test", "data", ) - - pytorch_converted_dir = os.path.join(onnx_model_dir, "pytorch-converted") - - pytorch_operator_dir = os.path.join(onnx_model_dir, "pytorch-operator") diff --git a/test/onnx/torch_export/test_torch_export_with_onnxruntime.py b/test/onnx/torch_export/test_torch_export_with_onnxruntime.py index df51b02584f..3f4deda7dcb 100644 --- a/test/onnx/torch_export/test_torch_export_with_onnxruntime.py +++ b/test/onnx/torch_export/test_torch_export_with_onnxruntime.py @@ -1,8 +1,8 @@ # Owner(s): ["module: onnx"] from __future__ import annotations -import os import sys +from pathlib import Path import torch import torch.onnx @@ -10,7 +10,8 @@ from torch.testing._internal import common_utils from torch.utils import _pytree as torch_pytree -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +sys.path.append(str(Path(__file__).absolute().parents[1])) + import onnx_test_common diff --git a/test/optim/test_optim.py b/test/optim/test_optim.py index 0fd270338d4..00f5db1478c 100644 --- a/test/optim/test_optim.py +++ b/test/optim/test_optim.py @@ -76,12 +76,22 @@ def _multistep_backprop_diff_hyperparams_fn( # This copy is necessary so the update on line 78 doesn't overwrite the original kwargs values kwargs = kwargs.copy() + + # Have to pass in beta1 and beta2 separately + # so they're passed in as Tensors (not a tuple) and recognized by gradcheck + if "beta1" in kwargs or "beta2" in kwargs: + # Prevent just one beta kwarg from being passed in + assert ( + "beta1" in kwargs and "beta2" in kwargs + ), "Both betas should be defined in kwargs" + kwargs.update({"betas": (kwargs.pop("beta1"), 
kwargs.pop("beta2"))}) + kwargs.update( {k: v.clone() if isinstance(v, torch.Tensor) else v for k, v in kwargs.items()} ) differentiable_kwargs = [ v for v in kwargs.values() if isinstance(v, torch.Tensor) and v.requires_grad - ] + ] + (list(kwargs["betas"]) if "betas" in kwargs else []) criterion = nn.MSELoss() @@ -104,6 +114,10 @@ def _multistep_backprop_diff_hyperparams_fn( meta_loss = loss meta_loss.backward(inputs=(*differentiable_kwargs,), create_graph=True) + # Extra check to make sure the test properly computed a gradient for all kwargs + for kwarg in differentiable_kwargs: + assert kwarg.grad is not None + return ( (meta_loss,) + tuple( @@ -111,11 +125,7 @@ def _multistep_backprop_diff_hyperparams_fn( for v in optimizer.state[params].values() if isinstance(v, torch.Tensor) and v.requires_grad ) - + tuple( - v - for v in kwargs.values() - if isinstance(v, torch.Tensor) and v.requires_grad - ) + + tuple(differentiable_kwargs) ) @@ -404,6 +414,276 @@ class TestDifferentiableOptimizer(TestCase): ), ) + def test_adam_differentiable_lr(self): + params = torch.rand(10, requires_grad=True, dtype=torch.float64) + grad = torch.rand_like(params, requires_grad=True, dtype=torch.float64) + lr = torch.tensor(0.001, requires_grad=True, dtype=torch.float64) + + state = {} + state["step"] = torch.tensor(10.0, requires_grad=False, dtype=torch.float64) + state["exp_avg"] = torch.rand(10, requires_grad=True, dtype=torch.float64) + state["exp_avg_sq"] = torch.rand(10, requires_grad=True, dtype=torch.float64) + state["max_exp_avg_sq"] = torch.rand( + 10, requires_grad=True, dtype=torch.float64 + ) + kwargs: dict[str, Any] = {"lr": lr, "differentiable": True} + + gradcheck( + _multistep_backprop_diff_hyperparams_fn, + ( + params, + grad, + state, + Adam, + kwargs, # includes lr + *state.values(), + *kwargs.values(), + ), + ) + + def test_adam_differentiable_weight_decay(self): + params = torch.rand(10, requires_grad=True, dtype=torch.float64) + grad = torch.rand_like(params, requires_grad=True, dtype=torch.float64) + weight_decay = torch.tensor(0.999, requires_grad=True, dtype=torch.float64) + + state = {} + state["step"] = torch.tensor(10.0, requires_grad=False, dtype=torch.float64) + state["exp_avg"] = torch.rand(10, requires_grad=True, dtype=torch.float64) + state["exp_avg_sq"] = torch.rand(10, requires_grad=True, dtype=torch.float64) + state["max_exp_avg_sq"] = torch.rand( + 10, requires_grad=True, dtype=torch.float64 + ) + kwargs: dict[str, Any] = {"weight_decay": weight_decay, "differentiable": True} + + gradcheck( + _multistep_backprop_diff_hyperparams_fn, + ( + params, + grad, + state, + Adam, + kwargs, # includes weight_decay + *state.values(), + *kwargs.values(), + ), + ) + + def test_adam_differentiable_betas(self): + params = torch.rand(10, requires_grad=True, dtype=torch.float64) + grad = torch.rand_like(params, requires_grad=True, dtype=torch.float64) + + lr = torch.tensor([0.001], requires_grad=True, dtype=torch.float64) + betas = ( + torch.tensor(0.9, requires_grad=True, dtype=torch.float64), + torch.tensor(0.999, requires_grad=True, dtype=torch.float64), + ) + state = {} + state["step"] = torch.tensor(10.0, requires_grad=False, dtype=torch.float64) + state["exp_avg"] = torch.rand(10, requires_grad=True, dtype=torch.float64) + state["exp_avg_sq"] = torch.rand(10, requires_grad=True, dtype=torch.float64) + state["max_exp_avg_sq"] = torch.rand( + 10, requires_grad=True, dtype=torch.float64 + ) + + # Have to pass in beta1 and beta2 separately + # so they're passed in as Tensors (not a 
tuple) and recognized by gradcheck. + # In the test, this is called: kwargs.update({betas: (beta1, beta2)}) + kwargs: dict[str, Any] = { + "beta1": betas[0], + "beta2": betas[1], + "lr": lr, + "differentiable": True, + } + + gradcheck( + _multistep_backprop_diff_hyperparams_fn, + ( + params, + grad, + state, + Adam, + kwargs, # includes betas + *state.values(), + *kwargs.values(), + ), + ) + + def test_adam_differentiable_all_hyperparams(self): + params = torch.rand(10, requires_grad=True, dtype=torch.float64) + grad = torch.rand_like(params, requires_grad=True, dtype=torch.float64) + + lr = torch.tensor(0.001, requires_grad=True, dtype=torch.float64) + weight_decay = torch.tensor(0.999, requires_grad=True, dtype=torch.float64) + betas = ( + torch.tensor(0.9, requires_grad=True, dtype=torch.float64), + torch.tensor(0.999, requires_grad=True, dtype=torch.float64), + ) + state = {} + state["step"] = torch.tensor(10.0, requires_grad=False, dtype=torch.float64) + state["exp_avg"] = torch.rand(10, requires_grad=True, dtype=torch.float64) + state["exp_avg_sq"] = torch.rand(10, requires_grad=True, dtype=torch.float64) + state["max_exp_avg_sq"] = torch.rand( + 10, requires_grad=True, dtype=torch.float64 + ) + + # Have to pass in beta1 and beta2 separately + # so they're passed in as Tensors (not a tuple) and recognized by gradcheck. + # In the test, this is called: kwargs.update({betas: (beta1, beta2)}) + kwargs: dict[str, Any] = { + "lr": lr, + "weight_decay": weight_decay, + "beta1": betas[0], + "beta2": betas[1], + "differentiable": True, + } + + gradcheck( + _multistep_backprop_diff_hyperparams_fn, + ( + params, + grad, + state, + Adam, + kwargs, # includes betas + *state.values(), + *kwargs.values(), + ), + ) + + def test_adamw_differentiable_lr(self): + params = torch.rand(10, requires_grad=True, dtype=torch.float64) + grad = torch.rand_like(params, requires_grad=True, dtype=torch.float64) + lr = torch.tensor(0.001, requires_grad=True, dtype=torch.float64) + + state = {} + state["step"] = torch.tensor(10.0, requires_grad=False, dtype=torch.float64) + state["exp_avg"] = torch.rand(10, requires_grad=True, dtype=torch.float64) + state["exp_avg_sq"] = torch.rand(10, requires_grad=True, dtype=torch.float64) + state["max_exp_avg_sq"] = torch.rand( + 10, requires_grad=True, dtype=torch.float64 + ) + kwargs: dict[str, Any] = {"lr": lr, "differentiable": True} + + gradcheck( + _multistep_backprop_diff_hyperparams_fn, + ( + params, + grad, + state, + AdamW, + kwargs, # includes lr + *state.values(), + *kwargs.values(), + ), + ) + + def test_adamw_differentiable_weight_decay(self): + params = torch.rand(10, requires_grad=True, dtype=torch.float64) + grad = torch.rand_like(params, requires_grad=True, dtype=torch.float64) + weight_decay = torch.tensor(0.999, requires_grad=True, dtype=torch.float64) + + state = {} + state["step"] = torch.tensor(10.0, requires_grad=False, dtype=torch.float64) + state["exp_avg"] = torch.rand(10, requires_grad=True, dtype=torch.float64) + state["exp_avg_sq"] = torch.rand(10, requires_grad=True, dtype=torch.float64) + state["max_exp_avg_sq"] = torch.rand( + 10, requires_grad=True, dtype=torch.float64 + ) + kwargs: dict[str, Any] = {"weight_decay": weight_decay, "differentiable": True} + + gradcheck( + _multistep_backprop_diff_hyperparams_fn, + ( + params, + grad, + state, + AdamW, + kwargs, # includes weight_decay + *state.values(), + *kwargs.values(), + ), + ) + + def test_adamw_differentiable_betas(self): + params = torch.rand(10, requires_grad=True, dtype=torch.float64) + 
grad = torch.rand_like(params, requires_grad=True, dtype=torch.float64) + + betas = ( + torch.tensor(0.9, requires_grad=True, dtype=torch.float64), + torch.tensor(0.999, requires_grad=True, dtype=torch.float64), + ) + state = {} + state["step"] = torch.tensor(10.0, requires_grad=False, dtype=torch.float64) + state["exp_avg"] = torch.rand(10, requires_grad=True, dtype=torch.float64) + state["exp_avg_sq"] = torch.rand(10, requires_grad=True, dtype=torch.float64) + state["max_exp_avg_sq"] = torch.rand( + 10, requires_grad=True, dtype=torch.float64 + ) + + # Have to pass in beta1 and beta2 separately + # so they're passed in as Tensors (not a tuple) and recognized by gradcheck. + # In the test, this is called: kwargs.update({betas: (beta1, beta2)}) + kwargs: dict[str, Any] = { + "beta1": betas[0], + "beta2": betas[1], + "differentiable": True, + } + + gradcheck( + _multistep_backprop_diff_hyperparams_fn, + ( + params, + grad, + state, + AdamW, + kwargs, # includes betas + *state.values(), + *kwargs.values(), + ), + ) + + def test_adamw_differentiable_all_hyperparams(self): + params = torch.rand(10, requires_grad=True, dtype=torch.float64) + grad = torch.rand_like(params, requires_grad=True, dtype=torch.float64) + + lr = torch.tensor(0.001, requires_grad=True, dtype=torch.float64) + weight_decay = torch.tensor(0.999, requires_grad=True, dtype=torch.float64) + betas = ( + torch.tensor(0.9, requires_grad=True, dtype=torch.float64), + torch.tensor(0.999, requires_grad=True, dtype=torch.float64), + ) + state = {} + state["step"] = torch.tensor(10.0, requires_grad=False, dtype=torch.float64) + state["exp_avg"] = torch.rand(10, requires_grad=True, dtype=torch.float64) + state["exp_avg_sq"] = torch.rand(10, requires_grad=True, dtype=torch.float64) + state["max_exp_avg_sq"] = torch.rand( + 10, requires_grad=True, dtype=torch.float64 + ) + + # Have to pass in beta1 and beta2 separately + # so they're passed in as Tensors (not a tuple) and recognized by gradcheck. 
+ # In the test, this is called: kwargs.update({betas: (beta1, beta2)}) + kwargs: dict[str, Any] = { + "lr": lr, + "weight_decay": weight_decay, + "beta1": betas[0], + "beta2": betas[1], + "differentiable": True, + } + + gradcheck( + _multistep_backprop_diff_hyperparams_fn, + ( + params, + grad, + state, + AdamW, + kwargs, # includes betas + *state.values(), + *kwargs.values(), + ), + ) + def test_differentiable_lr(self): params = torch.rand(10, requires_grad=True, dtype=torch.float64) grad = torch.rand_like(params, requires_grad=True, dtype=torch.float64) diff --git a/test/package/test_save_load.py b/test/package/test_save_load.py index 2f800c68792..a0cc967787e 100644 --- a/test/package/test_save_load.py +++ b/test/package/test_save_load.py @@ -2,8 +2,11 @@ import pickle from io import BytesIO +from sys import version_info from textwrap import dedent +from unittest import skipIf +import torch from torch.package import PackageExporter, PackageImporter, sys_importer from torch.testing._internal.common_utils import run_tests @@ -265,6 +268,20 @@ class TestSaveLoad(PackageTestCase): exporter.intern("**") exporter.save_module("package_a.use_torch_package_importer") + @skipIf(version_info >= (3, 13), "https://github.com/pytorch/pytorch/issues/142170") + def test_save_load_fp8(self): + tensor = torch.rand(20, 20).to(torch.float8_e4m3fn) + + buffer = BytesIO() + with PackageExporter(buffer) as exporter: + exporter.save_pickle("fp8_model", "model.pkl", tensor) + + buffer.seek(0) + + importer = PackageImporter(buffer) + loaded_tensor = importer.load_pickle("fp8_model", "model.pkl") + self.assertTrue(torch.equal(tensor, loaded_tensor)) + if __name__ == "__main__": run_tests() diff --git a/test/quantization/core/test_docs.py b/test/quantization/core/test_docs.py index 64623669924..2222ef64b62 100644 --- a/test/quantization/core/test_docs.py +++ b/test/quantization/core/test_docs.py @@ -51,7 +51,7 @@ class TestQuantizationDocs(QuantizationTestCase): "been updated to have the correct relative path between " "test_docs.py and the docs." ) - pytorch_root = core_dir.parent.parent.parent + pytorch_root = core_dir.parents[2] return pytorch_root / path_from_pytorch path_to_file = get_correct_path(path_from_pytorch) diff --git a/test/test_typing.py b/test/test_typing.py index 7df3096fcc6..bd7998fee7f 100644 --- a/test/test_typing.py +++ b/test/test_typing.py @@ -30,7 +30,7 @@ DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "typing")) REVEAL_DIR = os.path.join(DATA_DIR, "reveal") PASS_DIR = os.path.join(DATA_DIR, "pass") FAIL_DIR = os.path.join(DATA_DIR, "fail") -MYPY_INI = os.path.join(DATA_DIR, os.pardir, os.pardir, "mypy.ini") +MYPY_INI = os.path.join(os.path.dirname(os.path.dirname(DATA_DIR)), "mypy.ini") CACHE_DIR = os.path.join(DATA_DIR, ".mypy_cache") diff --git a/third_party/xpu.txt b/third_party/xpu.txt index 5572f097f22..1387a1102f6 100644 --- a/third_party/xpu.txt +++ b/third_party/xpu.txt @@ -1 +1 @@ -7ecb0b1a56b65dec63837a30972a8ba6f8432477 +214f33b9d969930a18656a82b5c5d8da53cdcb8e diff --git a/tools/amd_build/build_amd.py b/tools/amd_build/build_amd.py index 60a1be73fbb..26af7eec1fb 100755 --- a/tools/amd_build/build_amd.py +++ b/tools/amd_build/build_amd.py @@ -4,15 +4,16 @@ import argparse import os import sys +from pathlib import Path -sys.path.append( - os.path.realpath( - os.path.join( - __file__, os.path.pardir, os.path.pardir, os.path.pardir, "torch", "utils" - ) - ) -) +# NOTE: `tools/amd_build/build_amd.py` could be a symlink. 
+# The behavior of `symlink / '..'` is different from `symlink.parent`. +# Use `pardir` three times rather than using `path.parents[2]`. +REPO_ROOT = ( + Path(__file__).absolute() / os.path.pardir / os.path.pardir / os.path.pardir +).resolve() +sys.path.append(str(REPO_ROOT / "torch" / "utils")) from hipify import hipify_python # type: ignore[import] @@ -53,8 +54,9 @@ parser.add_argument( args = parser.parse_args() +# NOTE: `tools/amd_build/build_amd.py` could be a symlink. amd_build_dir = os.path.dirname(os.path.realpath(__file__)) -proj_dir = os.path.join(os.path.dirname(os.path.dirname(amd_build_dir))) +proj_dir = os.path.dirname(os.path.dirname(amd_build_dir)) if args.project_directory: proj_dir = args.project_directory diff --git a/tools/build_libtorch.py b/tools/build_libtorch.py index b1cda9575c9..192b75c75a0 100644 --- a/tools/build_libtorch.py +++ b/tools/build_libtorch.py @@ -1,13 +1,13 @@ import argparse import sys -from os.path import abspath, dirname +from pathlib import Path -# By appending pytorch_root to sys.path, this module can import other torch +# By appending REPO_ROOT to sys.path, this module can import other torch # modules even when run as a standalone script. i.e., it's okay either you # do `python build_libtorch.py` or `python -m tools.build_libtorch`. -pytorch_root = dirname(dirname(abspath(__file__))) -sys.path.append(pytorch_root) +REPO_ROOT = Path(__file__).absolute().parent.parent +sys.path.append(str(REPO_ROOT)) from tools.build_pytorch_libs import build_pytorch from tools.setup_helpers.cmake import CMake diff --git a/tools/code_coverage/package/oss/utils.py b/tools/code_coverage/package/oss/utils.py index c4019d76289..a5a5e5112a5 100644 --- a/tools/code_coverage/package/oss/utils.py +++ b/tools/code_coverage/package/oss/utils.py @@ -43,9 +43,7 @@ def get_llvm_tool_path() -> str: def get_pytorch_folder() -> str: # TOOLS_FOLDER in oss: pytorch/tools/code_coverage return os.path.abspath( - os.environ.get( - "PYTORCH_FOLDER", os.path.join(TOOLS_FOLDER, os.path.pardir, os.path.pardir) - ) + os.environ.get("PYTORCH_FOLDER", os.path.dirname(os.path.dirname(TOOLS_FOLDER))) ) diff --git a/tools/code_coverage/package/util/setting.py b/tools/code_coverage/package/util/setting.py index 42a94409b5e..9d31fa58f14 100644 --- a/tools/code_coverage/package/util/setting.py +++ b/tools/code_coverage/package/util/setting.py @@ -2,13 +2,12 @@ from __future__ import annotations import os from enum import Enum +from pathlib import Path # HOME_DIR = os.environ["HOME"] -TOOLS_FOLDER = os.path.join( - os.path.dirname(os.path.realpath(__file__)), os.path.pardir, os.path.pardir -) +TOOLS_FOLDER = str(Path(__file__).resolve().parents[2]) # diff --git a/tools/gen_vulkan_spv.py b/tools/gen_vulkan_spv.py index a64fb45591f..767ba3925a7 100644 --- a/tools/gen_vulkan_spv.py +++ b/tools/gen_vulkan_spv.py @@ -10,24 +10,28 @@ import glob import io import os import re -import sys -from itertools import product - -sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) import subprocess +import sys import textwrap from dataclasses import dataclass +from itertools import product +from pathlib import Path from typing import Any import yaml from yaml.constructor import ConstructorError from yaml.nodes import MappingNode + try: from yaml import CLoader as Loader except ImportError: from yaml import Loader # type: ignore[assignment, misc] + +REPO_ROOT = Path(__file__).absolute().parent.parent +sys.path.append(str(REPO_ROOT)) + CPP_H_NAME = "spv.h" CPP_SRC_NAME = "spv.cpp" diff 
--git a/tools/linter/adapters/s3_init.py b/tools/linter/adapters/s3_init.py index 3f2649bd225..80e61efb612 100644 --- a/tools/linter/adapters/s3_init.py +++ b/tools/linter/adapters/s3_init.py @@ -26,10 +26,7 @@ try: PYTORCH_ROOT = result.stdout.decode("utf-8").strip() except subprocess.CalledProcessError: # If git is not installed, compute repo root as 3 folders up from this file - path_ = os.path.abspath(__file__) - for _ in range(4): - path_ = os.path.dirname(path_) - PYTORCH_ROOT = path_ + PYTORCH_ROOT = str(Path(__file__).absolute().parents[3]) DRY_RUN = False diff --git a/tools/onnx/update_default_opset_version.py b/tools/onnx/update_default_opset_version.py index 8c9710da37b..88a98e5b27c 100755 --- a/tools/onnx/update_default_opset_version.py +++ b/tools/onnx/update_default_opset_version.py @@ -30,7 +30,7 @@ def read_sub_write(path: str, prefix_pat: str, new_default: int) -> None: def main(args: Any) -> None: - pytorch_dir = Path(__file__).parent.parent.parent.resolve() + pytorch_dir = Path(__file__).parents[2].resolve() onnx_dir = pytorch_dir / "third_party" / "onnx" os.chdir(onnx_dir) diff --git a/tools/setup_helpers/cmake.py b/tools/setup_helpers/cmake.py index 84e4dad32d3..abb9aa4890d 100644 --- a/tools/setup_helpers/cmake.py +++ b/tools/setup_helpers/cmake.py @@ -8,6 +8,7 @@ import platform import sys import sysconfig from distutils.version import LooseVersion +from pathlib import Path from subprocess import CalledProcessError, check_call, check_output from typing import Any, cast @@ -173,9 +174,7 @@ class CMake: toolset_expr = ",".join([f"{k}={v}" for k, v in toolset_dict.items()]) args.append("-T" + toolset_expr) - base_dir = os.path.dirname( - os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - ) + base_dir = str(Path(__file__).absolute().parents[2]) install_dir = os.path.join(base_dir, "torch") _mkdir_p(install_dir) diff --git a/tools/setup_helpers/gen.py b/tools/setup_helpers/gen.py index d7e63d9ed4a..fb3b21fbc8c 100644 --- a/tools/setup_helpers/gen.py +++ b/tools/setup_helpers/gen.py @@ -1,11 +1,11 @@ # Little stub file to get BUILD.bazel to play along -import os.path import sys +from pathlib import Path -root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -sys.path.insert(0, root) +REPO_ROOT = Path(__file__).absolute().parents[2] +sys.path.insert(0, str(REPO_ROOT)) import torchgen.gen diff --git a/tools/setup_helpers/gen_unboxing.py b/tools/setup_helpers/gen_unboxing.py index 91c61f0ab20..6e733d71059 100644 --- a/tools/setup_helpers/gen_unboxing.py +++ b/tools/setup_helpers/gen_unboxing.py @@ -1,11 +1,11 @@ # Little stub file to get BUILD.bazel to play along -import os.path import sys +from pathlib import Path -root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -sys.path.insert(0, root) +REPO_ROOT = Path(__file__).absolute().parents[2] +sys.path.insert(0, str(REPO_ROOT)) import tools.jit.gen_unboxing diff --git a/tools/setup_helpers/generate_code.py b/tools/setup_helpers/generate_code.py index 9fee1909970..6e0a64888f0 100644 --- a/tools/setup_helpers/generate_code.py +++ b/tools/setup_helpers/generate_code.py @@ -15,6 +15,7 @@ try: except ImportError: from yaml import SafeLoader as YamlLoader # type: ignore[assignment, misc] + NATIVE_FUNCTIONS_PATH = "aten/src/ATen/native/native_functions.yaml" TAGS_PATH = "aten/src/ATen/native/tags.yaml" @@ -110,8 +111,9 @@ def get_selector( operators_yaml_path: str | None, ) -> Any: # cwrap depends on pyyaml, so we can't import it earlier - root = 
os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - sys.path.insert(0, root) + REPO_ROOT = Path(__file__).absolute().parents[2] + sys.path.insert(0, str(REPO_ROOT)) + from torchgen.selective_build.selector import SelectiveBuilder assert not ( diff --git a/tools/stats/export_test_times.py b/tools/stats/export_test_times.py index 9bfb6b3810b..ae877188330 100644 --- a/tools/stats/export_test_times.py +++ b/tools/stats/export_test_times.py @@ -2,8 +2,9 @@ import sys from pathlib import Path -REPO_ROOT = Path(__file__).resolve().parent.parent.parent +REPO_ROOT = Path(__file__).resolve().parents[2] sys.path.append(str(REPO_ROOT)) + from tools.stats.import_test_stats import get_test_class_times, get_test_times diff --git a/tools/stats/import_test_stats.py b/tools/stats/import_test_stats.py index 57ceb9b8b49..ca4d8e4a3b4 100644 --- a/tools/stats/import_test_stats.py +++ b/tools/stats/import_test_stats.py @@ -11,7 +11,7 @@ from typing import Any, Callable, cast, Dict from urllib.request import urlopen -REPO_ROOT = Path(__file__).resolve().parent.parent.parent +REPO_ROOT = Path(__file__).resolve().parents[2] def get_disabled_issues() -> list[str]: diff --git a/tools/test/heuristics/test_heuristics.py b/tools/test/heuristics/test_heuristics.py index a4729261503..575d1b5732b 100644 --- a/tools/test/heuristics/test_heuristics.py +++ b/tools/test/heuristics/test_heuristics.py @@ -10,7 +10,7 @@ from typing import Any from unittest import mock -REPO_ROOT = Path(__file__).resolve().parent.parent.parent.parent +REPO_ROOT = Path(__file__).resolve().parents[3] sys.path.append(str(REPO_ROOT)) from tools.test.heuristics.test_interface import TestTD diff --git a/tools/test/heuristics/test_interface.py b/tools/test/heuristics/test_interface.py index 59b6c13e639..a51ab939403 100644 --- a/tools/test/heuristics/test_interface.py +++ b/tools/test/heuristics/test_interface.py @@ -6,7 +6,7 @@ from pathlib import Path from typing import Any -REPO_ROOT = Path(__file__).resolve().parent.parent.parent.parent +REPO_ROOT = Path(__file__).resolve().parents[3] sys.path.append(str(REPO_ROOT)) import tools.testing.target_determination.heuristics.interface as interface diff --git a/tools/test/heuristics/test_utils.py b/tools/test/heuristics/test_utils.py index 6deb797d31c..e1f47b8453e 100644 --- a/tools/test/heuristics/test_utils.py +++ b/tools/test/heuristics/test_utils.py @@ -6,7 +6,7 @@ from pathlib import Path from typing import Any -REPO_ROOT = Path(__file__).resolve().parent.parent.parent.parent +REPO_ROOT = Path(__file__).resolve().parents[3] sys.path.append(str(REPO_ROOT)) import tools.testing.target_determination.heuristics.utils as utils diff --git a/tools/test/test_gen_backend_stubs.py b/tools/test/test_gen_backend_stubs.py index 303c6f227a8..5c7b5e4cfc8 100644 --- a/tools/test/test_gen_backend_stubs.py +++ b/tools/test/test_gen_backend_stubs.py @@ -2,7 +2,6 @@ from __future__ import annotations -import os import tempfile import unittest @@ -12,10 +11,6 @@ from torchgen.gen import _GLOBAL_PARSE_NATIVE_YAML_CACHE # noqa: F401 from torchgen.gen_backend_stubs import run -path = os.path.dirname(os.path.realpath(__file__)) -gen_backend_stubs_path = os.path.join(path, "../torchgen/gen_backend_stubs.py") - - # gen_backend_stubs.py is an integration point that is called directly by external backends. # The tests here are to confirm that badly formed inputs result in reasonable error messages. 
class TestGenBackendStubs(expecttest.TestCase): diff --git a/tools/test/test_test_run.py b/tools/test/test_test_run.py index 7e9a8f6685c..c3fc2736f84 100644 --- a/tools/test/test_test_run.py +++ b/tools/test/test_test_run.py @@ -3,7 +3,7 @@ import unittest from pathlib import Path -REPO_ROOT = Path(__file__).resolve().parent.parent.parent +REPO_ROOT = Path(__file__).resolve().parents[2] try: # using tools/ to optimize test run. sys.path.append(str(REPO_ROOT)) diff --git a/tools/test/test_test_selections.py b/tools/test/test_test_selections.py index 5e3e7a949fa..f5164ddbc3a 100644 --- a/tools/test/test_test_selections.py +++ b/tools/test/test_test_selections.py @@ -8,7 +8,7 @@ from collections import defaultdict from pathlib import Path -REPO_ROOT = Path(__file__).resolve().parent.parent.parent +REPO_ROOT = Path(__file__).resolve().parents[2] try: # using tools/ to optimize test run. sys.path.append(str(REPO_ROOT)) diff --git a/tools/test/test_upload_stats_lib.py b/tools/test/test_upload_stats_lib.py index 996c87a3942..8d2a7e639d2 100644 --- a/tools/test/test_upload_stats_lib.py +++ b/tools/test/test_upload_stats_lib.py @@ -10,7 +10,7 @@ from typing import Any from unittest import mock -REPO_ROOT = Path(__file__).resolve().parent.parent.parent +REPO_ROOT = Path(__file__).resolve().parents[2] sys.path.insert(0, str(REPO_ROOT)) from tools.stats.upload_metrics import add_global_metric, emit_metric, global_metrics diff --git a/tools/testing/discover_tests.py b/tools/testing/discover_tests.py index 3cebaf44756..614d036b45a 100644 --- a/tools/testing/discover_tests.py +++ b/tools/testing/discover_tests.py @@ -9,7 +9,7 @@ from pathlib import Path CPP_TEST_PREFIX = "cpp" CPP_TEST_PATH = "build/bin" CPP_TESTS_DIR = os.path.abspath(os.getenv("CPP_TESTS_DIR", default=CPP_TEST_PATH)) -REPO_ROOT = Path(__file__).resolve().parent.parent.parent +REPO_ROOT = Path(__file__).resolve().parents[2] def parse_test_module(test: str) -> str: diff --git a/tools/testing/do_target_determination_for_s3.py b/tools/testing/do_target_determination_for_s3.py index a280e5bfaf2..27a0fbb5b42 100644 --- a/tools/testing/do_target_determination_for_s3.py +++ b/tools/testing/do_target_determination_for_s3.py @@ -4,7 +4,7 @@ import sys from pathlib import Path -REPO_ROOT = Path(__file__).resolve().parent.parent.parent +REPO_ROOT = Path(__file__).resolve().parents[2] sys.path.insert(0, str(REPO_ROOT)) from tools.stats.import_test_stats import ( diff --git a/tools/testing/explicit_ci_jobs.py b/tools/testing/explicit_ci_jobs.py index bc7736194f4..dcf40647235 100755 --- a/tools/testing/explicit_ci_jobs.py +++ b/tools/testing/explicit_ci_jobs.py @@ -12,7 +12,7 @@ from typing import Any import yaml -REPO_ROOT = Path(__file__).parent.parent.parent +REPO_ROOT = Path(__file__).parents[2] CONFIG_YML = REPO_ROOT / ".circleci" / "config.yml" WORKFLOWS_DIR = REPO_ROOT / ".github" / "workflows" diff --git a/tools/testing/modulefinder_determinator.py b/tools/testing/modulefinder_determinator.py index 760fed9ad4e..01f99e745d0 100644 --- a/tools/testing/modulefinder_determinator.py +++ b/tools/testing/modulefinder_determinator.py @@ -8,7 +8,7 @@ from pathlib import Path from typing import Any -REPO_ROOT = Path(__file__).resolve().parent.parent.parent +REPO_ROOT = Path(__file__).resolve().parents[2] # These tests are slow enough that it's worth calculating whether the patch # touched any related files first. 
This list was manually generated, but for every diff --git a/tools/testing/target_determination/gen_artifact.py b/tools/testing/target_determination/gen_artifact.py index e6576979de9..a28a05bd2bd 100644 --- a/tools/testing/target_determination/gen_artifact.py +++ b/tools/testing/target_determination/gen_artifact.py @@ -6,7 +6,7 @@ from pathlib import Path from typing import Any -REPO_ROOT = Path(__file__).resolve().parent.parent.parent.parent +REPO_ROOT = Path(__file__).resolve().parents[3] def gen_ci_artifact(included: list[Any], excluded: list[Any]) -> None: diff --git a/tools/testing/target_determination/heuristics/filepath.py b/tools/testing/target_determination/heuristics/filepath.py index ae1ef5ab260..0005ba5df5e 100644 --- a/tools/testing/target_determination/heuristics/filepath.py +++ b/tools/testing/target_determination/heuristics/filepath.py @@ -17,7 +17,7 @@ from tools.testing.target_determination.heuristics.utils import ( from tools.testing.test_run import TestRun -REPO_ROOT = Path(__file__).parent.parent.parent.parent +REPO_ROOT = Path(__file__).parents[3] keyword_synonyms: dict[str, list[str]] = { "amp": ["mixed_precision"], diff --git a/tools/testing/target_determination/heuristics/llm.py b/tools/testing/target_determination/heuristics/llm.py index b046f96dafb..6c6a4b1be21 100644 --- a/tools/testing/target_determination/heuristics/llm.py +++ b/tools/testing/target_determination/heuristics/llm.py @@ -16,7 +16,7 @@ from tools.testing.target_determination.heuristics.utils import normalize_rating from tools.testing.test_run import TestRun -REPO_ROOT = Path(__file__).resolve().parent.parent.parent.parent.parent +REPO_ROOT = Path(__file__).resolve().parents[4] class LLM(HeuristicInterface): diff --git a/tools/testing/target_determination/heuristics/previously_failed_in_pr.py b/tools/testing/target_determination/heuristics/previously_failed_in_pr.py index a17145a7eca..bf0a9549cc9 100644 --- a/tools/testing/target_determination/heuristics/previously_failed_in_pr.py +++ b/tools/testing/target_determination/heuristics/previously_failed_in_pr.py @@ -20,7 +20,7 @@ from tools.testing.target_determination.heuristics.utils import ( from tools.testing.test_run import TestRun -REPO_ROOT = Path(__file__).resolve().parent.parent.parent.parent.parent +REPO_ROOT = Path(__file__).resolve().parents[4] class PreviouslyFailedInPR(HeuristicInterface): diff --git a/tools/testing/target_determination/heuristics/utils.py b/tools/testing/target_determination/heuristics/utils.py index 86a71b76868..d9e9b002e37 100644 --- a/tools/testing/target_determination/heuristics/utils.py +++ b/tools/testing/target_determination/heuristics/utils.py @@ -15,7 +15,8 @@ from warnings import warn if TYPE_CHECKING: from tools.testing.test_run import TestRun -REPO_ROOT = Path(__file__).resolve().parent.parent.parent.parent.parent + +REPO_ROOT = Path(__file__).resolve().parents[4] def python_test_file_to_test_name(tests: set[str]) -> set[str]: diff --git a/tools/testing/test_selections.py b/tools/testing/test_selections.py index b74f5b954c4..635f512f087 100644 --- a/tools/testing/test_selections.py +++ b/tools/testing/test_selections.py @@ -14,7 +14,7 @@ if TYPE_CHECKING: from collections.abc import Sequence -REPO_ROOT = Path(__file__).resolve().parent.parent.parent +REPO_ROOT = Path(__file__).resolve().parents[2] IS_MEM_LEAK_CHECK = os.getenv("PYTORCH_TEST_CUDA_MEM_LEAK_CHECK", "0") == "1" BUILD_ENVIRONMENT = os.getenv("BUILD_ENVIRONMENT", "") diff --git a/torch/_dynamo/external_utils.py b/torch/_dynamo/external_utils.py 
index 6d885973ef4..a54f0beb9f2 100644 --- a/torch/_dynamo/external_utils.py +++ b/torch/_dynamo/external_utils.py @@ -2,8 +2,8 @@ import functools import warnings -from typing import Any, Callable, List, Optional, TYPE_CHECKING, Union -from typing_extensions import deprecated +from typing import Any, Callable, List, Optional, TYPE_CHECKING, TypeVar, Union +from typing_extensions import deprecated, ParamSpec import torch import torch.utils._pytree as pytree @@ -14,6 +14,9 @@ try: except ModuleNotFoundError: np = None # type: ignore[assignment] +_P = ParamSpec("_P") +_R = TypeVar("_R") + if TYPE_CHECKING: # TorchScript does not support `@deprecated` # This is a workaround to avoid breaking TorchScript @@ -35,13 +38,13 @@ else: return torch.compiler.is_compiling() -def wrap_inline(fn: Callable[..., Any]) -> Callable[..., Any]: +def wrap_inline(fn: Callable[_P, _R]) -> Callable[_P, _R]: """ Create an extra frame around fn that is not in skipfiles. """ @functools.wraps(fn) - def inner(*args: Any, **kwargs: Any) -> Any: + def inner(*args: _P.args, **kwargs: _P.kwargs) -> _R: return fn(*args, **kwargs) return inner @@ -61,7 +64,7 @@ def call_hook( return result -def wrap_numpy(f: Callable[..., Any]) -> Callable[..., Any]: +def wrap_numpy(f: Callable[_P, _R]) -> Callable[_P, _R]: r"""Decorator that turns a function from ``np.ndarray``s to ``np.ndarray``s into a function from ``torch.Tensor``s to ``torch.Tensor``s. """ @@ -69,7 +72,7 @@ def wrap_numpy(f: Callable[..., Any]) -> Callable[..., Any]: return f @functools.wraps(f) - def wrap(*args: Any, **kwargs: Any) -> Any: + def wrap(*args: _P.args, **kwargs: _P.kwargs) -> pytree.PyTree: args, kwargs = pytree.tree_map_only( torch.Tensor, lambda x: x.numpy(), (args, kwargs) ) diff --git a/torch/_dynamo/testing.py b/torch/_dynamo/testing.py index f8583b1bd05..d401c83f0a8 100644 --- a/torch/_dynamo/testing.py +++ b/torch/_dynamo/testing.py @@ -20,6 +20,7 @@ from typing import ( TypeVar, Union, ) +from typing_extensions import ParamSpec from unittest.mock import patch import torch @@ -51,6 +52,8 @@ three = 3 log = logging.getLogger(__name__) +_P = ParamSpec("_P") + def clone_me(x: Optional[torch.Tensor]) -> Optional[torch.Tensor]: if x is None: @@ -407,9 +410,9 @@ def check_dynamic_shape_capture() -> bool: return not config.assume_static_by_default -def _make_fn_with_patches(fn: Callable[..., _T], *patches: Any) -> Callable[..., _T]: +def _make_fn_with_patches(fn: Callable[_P, _T], *patches: Any) -> Callable[_P, _T]: @functools.wraps(fn) - def _fn(*args: Any, **kwargs: Any) -> _T: + def _fn(*args: _P.args, **kwargs: _P.kwargs) -> _T: with contextlib.ExitStack() as stack: for module, attr, val in patches: stack.enter_context(patch.object(module, attr, val)) diff --git a/torch/_inductor/runtime/compile_tasks.py b/torch/_inductor/runtime/compile_tasks.py index 17788ab7920..3a962687571 100644 --- a/torch/_inductor/runtime/compile_tasks.py +++ b/torch/_inductor/runtime/compile_tasks.py @@ -5,6 +5,7 @@ import functools import os import sys import warnings +from pathlib import Path from types import ModuleType from typing import Any, Callable, Dict @@ -51,15 +52,13 @@ def _reload_python_module(key, path): def _set_triton_ptxas_path() -> None: if os.environ.get("TRITON_PTXAS_PATH") is not None: return - ptxas_path = os.path.abspath( - os.path.join(os.path.dirname(__file__), "..", "bin", "ptxas") - ) - if not os.path.exists(ptxas_path): + ptxas = Path(__file__).absolute().parents[1] / "bin" / "ptxas" + if not ptxas.exists(): return - if os.path.isfile(ptxas_path) 
and os.access(ptxas_path, os.X_OK): - os.environ["TRITON_PTXAS_PATH"] = ptxas_path + if ptxas.is_file() and os.access(ptxas, os.X_OK): + os.environ["TRITON_PTXAS_PATH"] = str(ptxas) else: - warnings.warn(f"{ptxas_path} exists but is not an executable") + warnings.warn(f"{ptxas} exists but is not an executable") def _worker_compile_triton(load_kernel: Callable[[], Any], extra_env: Dict[str, str]): diff --git a/torch/_strobelight/cli_function_profiler.py b/torch/_strobelight/cli_function_profiler.py index f228061afa6..e45fe6177ba 100644 --- a/torch/_strobelight/cli_function_profiler.py +++ b/torch/_strobelight/cli_function_profiler.py @@ -8,7 +8,8 @@ import subprocess import time from threading import Lock from timeit import default_timer as timer -from typing import Any, List, Optional, Sequence +from typing import Any, Callable, List, Optional, Sequence, TypeVar +from typing_extensions import ParamSpec logger = logging.getLogger("strobelight_function_profiler") @@ -23,6 +24,9 @@ logger.addHandler(console_handler) logger.setLevel(logging.INFO) logger.propagate = False +_P = ParamSpec("_P") +_R = TypeVar("_R") + class StrobelightCLIProfilerError(Exception): """ @@ -250,7 +254,9 @@ class StrobelightCLIFunctionProfiler: self._stop_strobelight_no_throw(collect_results=False) return False - def profile(self, work_function: Any, *args: Any, **kwargs: Any) -> Any: + def profile( + self, work_function: Callable[_P, _R], *args: _P.args, **kwargs: _P.kwargs + ) -> Optional[_R]: self.current_run_id = None self.profile_result = None @@ -288,6 +294,7 @@ class StrobelightCLIFunctionProfiler: self._stop_strobelight_no_throw(collect_results=False) StrobelightCLIFunctionProfiler._lock.release() raise error + return None # A function decorator that wraps profile, if no profiler is provided one with @@ -297,13 +304,15 @@ class StrobelightCLIFunctionProfiler: # @strobelight(stop_at_error=True,...) def strobelight( profiler: Optional[StrobelightCLIFunctionProfiler] = None, **kwargs: Any -) -> Any: +) -> Callable[[Callable[_P, _R]], Callable[_P, Optional[_R]]]: if not profiler: profiler = StrobelightCLIFunctionProfiler(**kwargs) - def strobelight_inner(work_function: Any) -> Any: + def strobelight_inner( + work_function: Callable[_P, _R] + ) -> Callable[_P, Optional[_R]]: @functools.wraps(work_function) - def wrapper_function(*args: Any, **kwargs: Any) -> Any: + def wrapper_function(*args: _P.args, **kwargs: _P.kwargs) -> Optional[_R]: return profiler.profile(work_function, *args, **kwargs) return wrapper_function diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py index 01b5e3fd64f..005c7a47ebd 100644 --- a/torch/_subclasses/fake_tensor.py +++ b/torch/_subclasses/fake_tensor.py @@ -542,7 +542,7 @@ class FakeTensorConfig: # # Making this a descriptor may seem overly fancy, but actually it's the most # convenient way to ensure access to FakeTensor during access, which is -# required for testing version counter and epoch validity.​ +# required for testing version counter and epoch validity. 
+# required for testing version counter and epoch validity.
class SymNumberMemoDescriptor: _name: str @@ -763,7 +763,7 @@ class FakeTensor(Tensor): @classmethod @count - def __torch_dispatch__( + def __torch_dispatch__( # type: ignore[override] # TODO cls, func: OpOverload, types: Sequence[Type], diff --git a/torch/distributed/_tools/fsdp2_mem_tracker.py b/torch/distributed/_tools/fsdp2_mem_tracker.py index 49ec85ac725..fa12f7b4e72 100644 --- a/torch/distributed/_tools/fsdp2_mem_tracker.py +++ b/torch/distributed/_tools/fsdp2_mem_tracker.py @@ -1,7 +1,19 @@ from copy import deepcopy from datetime import timedelta from functools import partial, wraps -from typing import Any, Callable, Dict, List, NamedTuple, Optional, Tuple, Type, Union +from typing import ( + Any, + Callable, + Dict, + List, + NamedTuple, + Optional, + Tuple, + Type, + TypeVar, + Union, +) +from typing_extensions import ParamSpec, TypeVarTuple, Unpack import torch import torch.distributed as dist @@ -26,6 +38,10 @@ _TOTAL_KEY = "Total" __all__ = ["FSDPMemTracker"] +_P = ParamSpec("_P") +_R = TypeVar("_R") +_Ts = TypeVarTuple("_Ts") + class _FSDPRefType(_RefType): """ @@ -185,8 +201,8 @@ class FSDPMemTracker(MemTracker): def _fsdp_state_pre_forward( self, fsdp_mod: FSDPModule, - orig_fsdp_state_pre_fw: Callable, - ) -> Callable: + orig_fsdp_state_pre_fw: Callable[_P, Tuple[Tuple[Unpack[_Ts]], Dict[str, Any]]], + ) -> Callable[_P, Tuple[Tuple[Unpack[_Ts]], Dict[str, Any]]]: # We capture memory snapshots before and after ``FSDPState._pre_forward`` to attribute the `unsharded` params # and `all_gather` buffers. There are three cases: # Case 1: If the module is not in the ``memory_tracking`` dictionary, create a new ``_FSDPModMemStats`` @@ -201,7 +217,9 @@ class FSDPMemTracker(MemTracker): # For Case 1 and 3, we also initialiaze the ``local_peak`` and ``PEAK_FW`` snapshot for the module. # For Case 2 we only capture 1 snapshot after ``FSDPState._pre_forward`` runs because it is a no-op. @wraps(orig_fsdp_state_pre_fw) - def inner(*args: Any, **kwargs: Any) -> Tuple[Tuple[Any, ...], Dict[str, Any]]: + def inner( + *args: _P.args, **kwargs: _P.kwargs + ) -> Tuple[Tuple[Unpack[_Ts]], Dict[str, Any]]: mod_fqn = self._mod_tracker.get_known_fqn(fsdp_mod) assert mod_fqn is not None if fsdp_mod not in self.memory_tracking: @@ -251,15 +269,15 @@ class FSDPMemTracker(MemTracker): def _fsdp_state_post_forward( self, fsdp_mod: FSDPModule, - orig_fsdp_state_post_fw: Callable, - ) -> Callable: + orig_fsdp_state_post_fw: Callable[_P, _R], + ) -> Callable[_P, _R]: # We capture memory snapshots before and after ``FSDPState._post_forward`` to capture the resharded state # if ``reshard_after_forward`` is not ``False``. There are two cases: # Case 1: This is called in backward, which means we are in the AC region. If this is the top most module # in the AC region, we set the flag ``_in_ac`` to False. # Case 2: This is called in forward. @wraps(orig_fsdp_state_post_fw) - def inner(*args: Any, **kwargs: Any) -> Any: + def inner(*args: _P.args, **kwargs: _P.kwargs) -> _R: mod_stat = self.memory_tracking[fsdp_mod] if self._mod_tracker.is_bw: state = _FSDPModState.POST_FW_AC @@ -283,12 +301,12 @@ class FSDPMemTracker(MemTracker): def _fsdp_param_group_pre_backward( self, fsdp_mod: FSDPModule, - orig_fsdp_param_group_pre_backward: Callable, - ) -> Callable: + orig_fsdp_param_group_pre_backward: Callable[_P, Any], + ) -> Callable[_P, None]: # We capture memory snapshots before and after ``FSDPParamGroup.pre_backward`` to capture the pre-fetching # and unsharding of params. 
We also initialize ``local_peak`` and ``PEAK_BW`` snapshot for the module. @wraps(orig_fsdp_param_group_pre_backward) - def inner(*args: Any, **kwargs: Any) -> None: + def inner(*args: _P.args, **kwargs: _P.kwargs) -> None: mod_stat = self.memory_tracking[fsdp_mod] snapshot = self.get_tracker_snapshot() mod_stat.local_peak = { @@ -309,13 +327,13 @@ def _fsdp_param_group_post_backward( self, fsdp_mod: FSDPModule, - orig_fsdp_param_group_post_backward: Callable, - ) -> Callable: + orig_fsdp_param_group_post_backward: Callable[_P, Any], + ) -> Callable[_P, None]: # We capture the memory snapshots before and after ``FSDPParamGroup.post_backward`` to track and attribute # the `unsharded` grads before the post backward and then `sharded` grads and `reduce_scatter` buffers # after the post backward. @wraps(orig_fsdp_param_group_post_backward) - def inner(*args: Any, **kwargs: Any) -> None: + def inner(*args: _P.args, **kwargs: _P.kwargs) -> None: fsdp_state = fsdp_mod._get_fsdp_state() if fsdp_param_group := fsdp_state._fsdp_param_group: for fsdp_param in fsdp_param_group.fsdp_params: diff --git a/torch/onnx/_internal/_lazy_import.py b/torch/onnx/_internal/_lazy_import.py index b0c23abd31b..e9d0340914c 100644 --- a/torch/onnx/_internal/_lazy_import.py +++ b/torch/onnx/_internal/_lazy_import.py @@ -1,6 +1,5 @@ """Utility to lazily import modules.""" -# mypy: allow-untyped-defs from __future__ import annotations import importlib @@ -17,7 +16,7 @@ class _LazyModule: def __repr__(self) -> str: return f"<lazy module '{self._name}'>" - def __getattr__(self, attr): + def __getattr__(self, attr: str) -> object: if self._module is None: self._module = importlib.import_module(".", self._name) return getattr(self._module, attr) diff --git a/torch/optim/adam.py b/torch/optim/adam.py index e3a628cb576..536c0c271c5 100644 --- a/torch/optim/adam.py +++ b/torch/optim/adam.py @@ -402,7 +402,14 @@ def _single_tensor_adam( # Perform stepweight decay param.mul_(1 - lr * weight_decay) else: - grad = grad.add(param, alpha=weight_decay) + # Nested if is necessary to bypass jitscript rules + if differentiable and isinstance(weight_decay, Tensor): + if weight_decay.requires_grad: + grad = grad.addcmul_(param.clone(), weight_decay) + else: + grad = grad.add(param, alpha=weight_decay) + else: + grad = grad.add(param, alpha=weight_decay) if torch.is_complex(param): grad = torch.view_as_real(grad) @@ -429,13 +436,43 @@ # Decay the first and second moment running average coefficient exp_avg.lerp_(grad, 1 - device_beta1) - exp_avg_sq.mul_(beta2).addcmul_(grad, grad.conj(), value=1 - beta2) + # Nested if is necessary to bypass jitscript rules + if differentiable and isinstance(beta2, Tensor): + if beta2.requires_grad: + # Using lerp to only use 2 operations bc addcmul's value cannot be a tensor + # Showing equivalence of differentiable path and nondifferentiable path + # expavg * b2 + grad^2 * (1-b2) + # add expavg * (1-b2) - expavg * (1-b2) = 0 + # expavg * b2 + expavg * (1-b2) - expavg * (1-b2) + grad^2 * (1-b2) + # expavg - expavg * (1-b2) + grad^2 * (1-b2) + # expavg + (grad^2 - expavg) * (1-b2) + # expavg.lerp(grad^2, 1-beta2) + exp_avg_sq.lerp_(torch.square(grad), weight=1 - beta2) + else: + exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) + else: + exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) if capturable or differentiable: step = step_t - bias_correction1 = 1 - beta1**step - bias_correction2 = 1 - beta2**step + # Nested if is necessary to bypass jitscript rules
+            if differentiable and isinstance(beta1, Tensor):
+                if beta1.requires_grad:
+                    bias_correction1 = 1 - beta1 ** step.clone()
+                else:
+                    bias_correction1 = 1 - beta1**step
+            else:
+                bias_correction1 = 1 - beta1**step
+
+            # Nested if is necessary to bypass jitscript rules
+            if differentiable and isinstance(beta2, Tensor):
+                if beta2.requires_grad:
+                    bias_correction2 = 1 - beta2 ** step.clone()
+                else:
+                    bias_correction2 = 1 - beta2**step
+            else:
+                bias_correction2 = 1 - beta2**step

             step_size = lr / bias_correction1
             step_size_neg = step_size.neg()
@@ -462,7 +499,10 @@
                     exp_avg_sq.sqrt() / (bias_correction2_sqrt * step_size_neg)
                 ).add_(eps / step_size_neg)

-            param.addcdiv_(exp_avg, denom)
+            if differentiable:
+                param.addcdiv_(exp_avg.clone(), denom)
+            else:
+                param.addcdiv_(exp_avg, denom)
         else:
             step = _get_value(step_t)
diff --git a/torch/package/package_importer.py b/torch/package/package_importer.py
index 72bde854cba..a9a577c4fcb 100644
--- a/torch/package/package_importer.py
+++ b/torch/package/package_importer.py
@@ -260,7 +260,10 @@ class PackageImporter(Importer):

             if typename == "storage":
                 storage_type, key, location, size = data
-                dtype = storage_type.dtype
+                if storage_type is torch.UntypedStorage:
+                    dtype = torch.uint8
+                else:
+                    dtype = storage_type.dtype

                 if key not in loaded_storages:
                     load_tensor(
diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index 2a9f90b2baf..bef72ad5455 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -5008,7 +5008,7 @@ def find_library_location(lib_name: str) -> Path:
     path = torch_root / 'lib' / lib_name
     if os.path.exists(path):
         return path
-    torch_root = Path(__file__).resolve().parent.parent.parent
+    torch_root = Path(__file__).resolve().parents[2]
     return torch_root / 'build' / 'lib' / lib_name

 def skip_but_pass_in_sandcastle(reason):
diff --git a/torch/utils/_stats.py b/torch/utils/_stats.py
index 148763d4970..6f2de072ea9 100644
--- a/torch/utils/_stats.py
+++ b/torch/utils/_stats.py
@@ -3,17 +3,23 @@ # AND SCRUB AWAY TORCH NOTIONS THERE.
 import collections
 import functools
-from typing import Any, Callable, OrderedDict
+from typing import Callable, OrderedDict, TypeVar
+from typing_extensions import ParamSpec
+

 simple_call_counter: OrderedDict[str, int] = collections.OrderedDict()

+_P = ParamSpec("_P")
+_R = TypeVar("_R")
+
+
 def count_label(label: str) -> None:
     prev = simple_call_counter.setdefault(label, 0)
     simple_call_counter[label] = prev + 1

-def count(fn: Callable[..., Any]) -> Callable[..., Any]:
+def count(fn: Callable[_P, _R]) -> Callable[_P, _R]:
     @functools.wraps(fn)
-    def wrapper(*args: Any, **kwargs: Any) -> Any:
+    def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> _R:
         if fn.__qualname__ not in simple_call_counter:
             simple_call_counter[fn.__qualname__] = 0
         simple_call_counter[fn.__qualname__] = simple_call_counter[fn.__qualname__] + 1
diff --git a/torch/utils/_strobelight/cli_function_profiler.py b/torch/utils/_strobelight/cli_function_profiler.py
index c2e4ae679a9..9502e762cec 100644
--- a/torch/utils/_strobelight/cli_function_profiler.py
+++ b/torch/utils/_strobelight/cli_function_profiler.py
@@ -7,7 +7,8 @@ import re
 import subprocess
 import time
 from threading import Lock
-from typing import Any, List, Optional, Sequence
+from typing import Any, Callable, List, Optional, Sequence, TypeVar
+from typing_extensions import ParamSpec

 logger = logging.getLogger("strobelight_function_profiler")
@@ -22,6 +23,9 @@ logger.addHandler(console_handler)
 logger.setLevel(logging.INFO)
 logger.propagate = False

+_P = ParamSpec("_P")
+_R = TypeVar("_R")
+

 class StrobelightCLIProfilerError(Exception):
     """
@@ -246,7 +250,9 @@ class StrobelightCLIFunctionProfiler:
             self._stop_strobelight_no_throw(collect_results=False)
             return False

-    def profile(self, work_function: Any, *args: Any, **kwargs: Any) -> Any:
+    def profile(
+        self, work_function: Callable[_P, _R], *args: _P.args, **kwargs: _P.kwargs
+    ) -> Optional[_R]:
         self.current_run_id = None

         if locked := StrobelightCLIFunctionProfiler._lock.acquire(False):
@@ -279,6 +285,7 @@ class StrobelightCLIFunctionProfiler:
                 self._stop_strobelight_no_throw(collect_results=False)
                 StrobelightCLIFunctionProfiler._lock.release()
                 raise error
+        return None


# A function decorator that wraps profile; if no profiler is provided, one with
# default arguments is created. A function can be annotated, for example, as
# @strobelight(stop_at_error=True,...)
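# The Callable[_P, _R] rewrites in this patch all follow the same ParamSpec
# decorator recipe; a minimal standalone sketch (the decorator name `log_calls`
# is hypothetical, and typing_extensions is assumed available, as elsewhere in
# the patch):
import functools
from typing import Callable, TypeVar
from typing_extensions import ParamSpec

_P = ParamSpec("_P")
_R = TypeVar("_R")


def log_calls(fn: Callable[_P, _R]) -> Callable[_P, _R]:
    @functools.wraps(fn)
    def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> _R:
        # The wrapper advertises fn's exact parameter and return types, so type
        # checkers can validate call sites of the decorated function.
        print(f"calling {fn.__qualname__}")
        return fn(*args, **kwargs)

    return wrapper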
 def strobelight(
     profiler: Optional[StrobelightCLIFunctionProfiler] = None, **kwargs: Any
-) -> Any:
+) -> Callable[[Callable[_P, _R]], Callable[_P, Optional[_R]]]:
     if not profiler:
         profiler = StrobelightCLIFunctionProfiler(**kwargs)

-    def strobelight_inner(work_function: Any) -> Any:
+    def strobelight_inner(
+        work_function: Callable[_P, _R]
+    ) -> Callable[_P, Optional[_R]]:
         @functools.wraps(work_function)
-        def wrapper_function(*args: Any, **kwargs: Any) -> Any:
+        def wrapper_function(*args: _P.args, **kwargs: _P.kwargs) -> Optional[_R]:
             return profiler.profile(work_function, *args, **kwargs)

         return wrapper_function
diff --git a/torch/utils/_sympy/functions.py b/torch/utils/_sympy/functions.py
index 9e34c11bfe7..cfc77c10f13 100644
--- a/torch/utils/_sympy/functions.py
+++ b/torch/utils/_sympy/functions.py
@@ -4,7 +4,6 @@ import math
 import operator
 import sys
 from typing import (
-    Any,
     Callable,
     Iterable,
     List,
@@ -14,6 +13,7 @@ from typing import (
     TypeVar,
     Union,
 )
+from typing_extensions import TypeVarTuple, Unpack

 import sympy
 from sympy import S
@@ -32,6 +32,7 @@ from .numbers import int_oo

 _T = TypeVar("_T", bound=SupportsFloat)
+_Ts = TypeVarTuple("_Ts")

 # Portions of this file are adapted from the Sympy codebase, which was
 # licensed as follows:
@@ -101,9 +102,11 @@ def _is_symbols_binary_summation(expr: sympy.Expr) -> bool:
     )


-def _keep_float(f: Callable[..., _T]) -> Callable[..., Union[_T, sympy.Float]]:
+def _keep_float(
+    f: Callable[[Unpack[_Ts]], _T]
+) -> Callable[[Unpack[_Ts]], Union[_T, sympy.Float]]:
     @functools.wraps(f)
-    def inner(*args: Any) -> Union[_T, sympy.Float]:
+    def inner(*args: Unpack[_Ts]) -> Union[_T, sympy.Float]:
         r: Union[_T, sympy.Float] = f(*args)
         if any(isinstance(a, sympy.Float) for a in args) and not isinstance(
             r, sympy.Float
diff --git a/torchgen/_autoheuristic/mixed_mm/gen_data_mixed_mm.py b/torchgen/_autoheuristic/mixed_mm/gen_data_mixed_mm.py
index fddcfe81354..48dfa788977 100644
--- a/torchgen/_autoheuristic/mixed_mm/gen_data_mixed_mm.py
+++ b/torchgen/_autoheuristic/mixed_mm/gen_data_mixed_mm.py
@@ -1,13 +1,12 @@
 # mypy: ignore-errors
-import os
 import random
 import sys
-
-
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-
+from pathlib import Path
 from typing import Any

+sys.path.append(str(Path(__file__).absolute().parents[1]))
+
 from benchmark_runner import BenchmarkRunner  # type: ignore[import-not-found]
 from benchmark_utils import (  # type: ignore[import-not-found]
     fits_in_memory,
diff --git a/torchgen/_autoheuristic/mixed_mm/test_mixed_mm.py b/torchgen/_autoheuristic/mixed_mm/test_mixed_mm.py
index 839e7ff87b2..d7add94ec12 100644
--- a/torchgen/_autoheuristic/mixed_mm/test_mixed_mm.py
+++ b/torchgen/_autoheuristic/mixed_mm/test_mixed_mm.py
@@ -1,12 +1,12 @@
-import os
 import sys
 import unittest
-
-
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from pathlib import Path

 from expecttest import TestCase

+sys.path.append(str(Path(__file__).absolute().parents[1]))
+
 from test_utils import read_file_to_string, run_bash  # type: ignore[import-not-found]
diff --git a/torchgen/_autoheuristic/mixed_mm/train_decision_mixedmm.py b/torchgen/_autoheuristic/mixed_mm/train_decision_mixedmm.py
index df96f020dc6..4316d36eaf5 100644
--- a/torchgen/_autoheuristic/mixed_mm/train_decision_mixedmm.py
+++ b/torchgen/_autoheuristic/mixed_mm/train_decision_mixedmm.py
@@ -1,9 +1,9 @@
 # mypy: ignore-errors
-import os
 import sys
+from pathlib import Path
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+sys.path.append(str(Path(__file__).absolute().parents[1]))

 from train_decision import AHTrainDecisionTree
diff --git a/torchgen/_autoheuristic/mm/gen_data_mm.py b/torchgen/_autoheuristic/mm/gen_data_mm.py
index 4e1e819869a..8ad6dc1c008 100644
--- a/torchgen/_autoheuristic/mm/gen_data_mm.py
+++ b/torchgen/_autoheuristic/mm/gen_data_mm.py
@@ -1,13 +1,12 @@
 import itertools
-import os
 import random
 import sys
-
-
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-
+from pathlib import Path
 from typing import Any

+sys.path.append(str(Path(__file__).absolute().parents[1]))
+
 from benchmark_runner import BenchmarkRunner  # type: ignore[import-not-found]
 from benchmark_utils import (  # type: ignore[import-not-found]
     fits_in_memory,
diff --git a/torchgen/_autoheuristic/mm/train_decision_mm.py b/torchgen/_autoheuristic/mm/train_decision_mm.py
index 945dcc98561..0eecee8146b 100644
--- a/torchgen/_autoheuristic/mm/train_decision_mm.py
+++ b/torchgen/_autoheuristic/mm/train_decision_mm.py
@@ -1,11 +1,11 @@
 # mypy: ignore-errors
-import os
 import sys
+from pathlib import Path

 import pandas as pd  # type: ignore[import-untyped]

-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+sys.path.append(str(Path(__file__).absolute().parents[1]))

 from train_decision import AHTrainDecisionTree
diff --git a/torchgen/_autoheuristic/pad_mm/gen_data_pad_mm.py b/torchgen/_autoheuristic/pad_mm/gen_data_pad_mm.py
index 80366bc2fbe..d5ddc44c1b7 100644
--- a/torchgen/_autoheuristic/pad_mm/gen_data_pad_mm.py
+++ b/torchgen/_autoheuristic/pad_mm/gen_data_pad_mm.py
@@ -1,12 +1,11 @@
-import os
 import random
 import sys
-
-
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-
+from pathlib import Path
 from typing import Any

+sys.path.append(str(Path(__file__).absolute().parents[1]))
+
 from benchmark_runner import BenchmarkRunner  # type: ignore[import-not-found]
 from benchmark_utils import (  # type: ignore[import-not-found]
     fits_in_memory,
diff --git a/torchgen/_autoheuristic/pad_mm/test_pad_mm.py b/torchgen/_autoheuristic/pad_mm/test_pad_mm.py
index 6469a6cd37d..dcb7920b83d 100644
--- a/torchgen/_autoheuristic/pad_mm/test_pad_mm.py
+++ b/torchgen/_autoheuristic/pad_mm/test_pad_mm.py
@@ -1,12 +1,12 @@
-import os
 import sys
 import unittest
-
-
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from pathlib import Path

 from expecttest import TestCase

+sys.path.append(str(Path(__file__).absolute().parents[1]))
+
 from test_utils import read_file_to_string, run_bash  # type: ignore[import-not-found]
diff --git a/torchgen/_autoheuristic/pad_mm/train_decision_pad_mm.py b/torchgen/_autoheuristic/pad_mm/train_decision_pad_mm.py
index 9ed37b7a00d..9e37892146f 100644
--- a/torchgen/_autoheuristic/pad_mm/train_decision_pad_mm.py
+++ b/torchgen/_autoheuristic/pad_mm/train_decision_pad_mm.py
@@ -1,9 +1,9 @@
 # mypy: ignore-errors
-import os
 import sys
+from pathlib import Path

-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+sys.path.append(str(Path(__file__).absolute().parents[1]))

 from train_decision import AHTrainDecisionTree
diff --git a/torchgen/_autoheuristic/pad_mm/train_pad_mm.py b/torchgen/_autoheuristic/pad_mm/train_pad_mm.py
index ab60c44dac0..58c4802d497 100644
--- a/torchgen/_autoheuristic/pad_mm/train_pad_mm.py
+++ b/torchgen/_autoheuristic/pad_mm/train_pad_mm.py
@@ -1,9 +1,9 @@
 # mypy: ignore-errors
-import os
 import sys
+from pathlib import Path

-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+sys.path.append(str(Path(__file__).absolute().parents[1]))

 from train_regression import AHTrainRegressionTree
diff --git a/torchgen/_autoheuristic/pad_mm/train_regression_pad_mm.py b/torchgen/_autoheuristic/pad_mm/train_regression_pad_mm.py
index e9cdbf517e0..bf9201e49a2 100644
--- a/torchgen/_autoheuristic/pad_mm/train_regression_pad_mm.py
+++ b/torchgen/_autoheuristic/pad_mm/train_regression_pad_mm.py
@@ -1,9 +1,9 @@
 # mypy: ignore-errors
-import os
 import sys
+from pathlib import Path

-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+sys.path.append(str(Path(__file__).absolute().parents[1]))

 from train_regression import AHTrainRegressionTree
diff --git a/torchgen/gen_backend_stubs.py b/torchgen/gen_backend_stubs.py
index b891c17671f..299a41081da 100644
--- a/torchgen/gen_backend_stubs.py
+++ b/torchgen/gen_backend_stubs.py
@@ -533,7 +533,7 @@ def run(
     source_yaml: str, output_dir: str, dry_run: bool, impl_path: str | None = None
 ) -> None:
     # Assumes that this file lives at PYTORCH_ROOT/torchgen/gen_backend_stubs.py
-    pytorch_root = Path(__file__).parent.parent.absolute()
+    pytorch_root = Path(__file__).absolute().parent.parent
     template_dir = os.path.join(pytorch_root, "aten/src/ATen/templates")

     def make_file_manager(install_dir: str) -> FileManager:
diff --git a/torchgen/gen_lazy_tensor.py b/torchgen/gen_lazy_tensor.py
index a15fa62fd1e..e397561d378 100644
--- a/torchgen/gen_lazy_tensor.py
+++ b/torchgen/gen_lazy_tensor.py
@@ -256,7 +256,7 @@ def main() -> None:
     options = parser.parse_args()

     # Assumes that this file lives at PYTORCH_ROOT/torchgen/gen_lazy_tensor.py
-    torch_root = Path(__file__).parent.parent.parent.absolute()
+    torch_root = Path(__file__).absolute().parents[2]
     aten_path = str(torch_root / "aten" / "src" / "ATen")
     lazy_ir_generator: type[GenLazyIR] = default_args.lazy_ir_generator
     if options.gen_ts_lowerings:
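The pathlib rewrites above keep the `parents[n]` spellings behavior-preserving
while deliberately moving `absolute()` to the front of the parent chain. A
minimal sketch of both points (the concrete paths are hypothetical, POSIX
assumed):

from pathlib import Path

# parents[0] is the immediate parent, so parents[2] == parent.parent.parent.
p = Path("/repo/torchgen/gen_lazy_tensor.py")
assert p.parents[2] == p.parent.parent.parent == Path("/")

# For a relative path, absolute() has to come first: taking .parent of a bare
# filename collapses to "." before the path is ever anchored to the working
# directory (assumes the working directory is not the filesystem root).
q = Path("gen.py")
assert q.absolute().parent.parent != q.parent.parent.absolute()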