diff --git a/.flake8 b/.flake8
index 9ef3b95fb35..75abc4d1904 100644
--- a/.flake8
+++ b/.flake8
@@ -22,8 +22,9 @@ exclude =
     ./docs/caffe2,
     ./docs/cpp/src,
     ./docs/src,
-    # See NOTE: [Impending functorch move]
-    ./functorch,
+    ./functorch/docs,
+    ./functorch/examples,
+    ./functorch/notebooks,
     ./scripts,
     ./test/generated_type_hints_smoketest.py,
     ./third_party,
diff --git a/.lintrunner.toml b/.lintrunner.toml
index fa5bf458772..0fdd3620f71 100644
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -9,11 +9,9 @@ exclude_patterns = [
     'docs/caffe2/**',
     'docs/cpp/src/**',
     'docs/src/**',
-    # NOTE: [Impending functorch move]
-    # In preparation for the functorch -> pytorch merge,
-    # we are adding the following excludes so that functorch passes
-    # lint when it gets merged in. Please don't delete.
-    'functorch/**',
+    'functorch/docs/**',
+    'functorch/examples/**',
+    'functorch/notebooks/**',
     'scripts/**',
     'test/generated_type_hints_smoketest.py',
     'third_party/**',
@@ -227,8 +225,6 @@ code = 'TYPEIGNORE'
 include_patterns = ['**/*.py', '**/*.pyi']
 exclude_patterns = [
     'test/test_jit.py',
-    # See NOTE: [Impending functorch move]
-    'functorch/**',
 ]
 command = [
     'python3',
@@ -301,8 +297,6 @@ exclude_patterns=[
     'tools/clang_format_hash/**',
     'test/cpp/jit/upgrader_models/*.ptl',
     'test/cpp/jit/upgrader_models/*.ptl.ff',
-    # See NOTE: [Impending functorch move]
-    'functorch/**',
 ]
 command = [
     'python3',
@@ -322,8 +316,6 @@ exclude_patterns = [
     'aten/src/ATen/native/vulkan/api/vk_mem_alloc.h',
     'test/cpp/jit/upgrader_models/*.ptl',
     'test/cpp/jit/upgrader_models/*.ptl.ff',
-    # See NOTE: [Impending functorch move]
-    'functorch/**',
 ]
 command = [
     'python3',
@@ -353,8 +345,6 @@ exclude_patterns = [
     'test/cpp/jit/upgrader_models/*.ptl',
     'test/cpp/jit/upgrader_models/*.ptl.ff',
     '.lintrunner.toml',
-    # See NOTE: [Impending functorch move]
-    'functorch/**',
 ]
 command = [
     'python3',
@@ -436,8 +426,6 @@ exclude_patterns = [
     '**/git-pre-commit',
     '**/git-clang-format',
     '**/gradlew',
-    # See NOTE: [Impending functorch move]
-    'functorch/**',
 ]
 command = [
     'python3',
diff --git a/functorch/.circleci/unittest/windows/scripts/set_cuda_envs.sh b/functorch/.circleci/unittest/windows/scripts/set_cuda_envs.sh
index d1ed415940d..7db3137b594 100644
--- a/functorch/.circleci/unittest/windows/scripts/set_cuda_envs.sh
+++ b/functorch/.circleci/unittest/windows/scripts/set_cuda_envs.sh
@@ -4,7 +4,7 @@ set -ex
 echo CU_VERSION is "${CU_VERSION}"
 echo CUDA_VERSION is "${CUDA_VERSION}"
 
-# Currenly, CU_VERSION and CUDA_VERSION are not consistent. 
+# Currently, CU_VERSION and CUDA_VERSION are not consistent.
 # to understand this code, see https://github.com/pytorch/vision/issues/4443
 version="cpu"
 if [[ ! -z "${CUDA_VERSION}" ]] ; then
diff --git a/functorch/benchmarks/chrome_trace_parser.py b/functorch/benchmarks/chrome_trace_parser.py
index f07a159a2ab..54d2bf1447f 100755
--- a/functorch/benchmarks/chrome_trace_parser.py
+++ b/functorch/benchmarks/chrome_trace_parser.py
@@ -21,7 +21,7 @@ def get_model_name(filename):
     return modelname
 
 def get_total_length(run_times_df, modelname):
-    return float(run_times_df[run_times_df["name"]==modelname]["runtime"])
+    return float(run_times_df[run_times_df["name"] == modelname]["runtime"])
 
 
 def main():
@@ -51,16 +51,17 @@ def main():
     else:
         print("Please provide a filename or a folder name")
 
-    print(f"modelname, GPU Utilization, MM and Conv time")
+    print("modelname, GPU Utilization, MM and Conv time")
 
-    run_times_df = pd.read_csv(args.runtime)   
+    run_times_df = pd.read_csv(args.runtime)
     for filename in filenames:
         try:
             modelname = get_model_name(filename)
             total_length = get_total_length(run_times_df, modelname) * 1e6
             utilization, mm_conv_utilization = compute_utilization(filenames, total_length)
             print(f"{modelname}, {utilization}, {mm_conv_utilization}")
-        except:
+        except Exception:
+            # Best-effort per-file reporting; KeyboardInterrupt/SystemExit must still propagate.
             logging.exception(f"{filename}, ERROR")
             print(f"{filename}, ERROR")
 
diff --git a/functorch/benchmarks/per_sample_grads.py b/functorch/benchmarks/per_sample_grads.py
index c5d911bf36a..e9e3524eca5 100644
--- a/functorch/benchmarks/per_sample_grads.py
+++ b/functorch/benchmarks/per_sample_grads.py
@@ -2,11 +2,8 @@ import torch
 import torch.nn as nn
 import torchvision.models as models
 from opacus.utils.module_modification import convert_batchnorm_modules
-from torchvision.datasets import CIFAR10
 import time
 
-from functools import partial
-import functorch
 from functorch import vmap, grad
 from functorch import make_functional
 from opacus import PrivacyEngine
diff --git a/functorch/benchmarks/process_scorecard.py b/functorch/benchmarks/process_scorecard.py
index 7c17e806ccf..f95d879238a 100644
--- a/functorch/benchmarks/process_scorecard.py
+++ b/functorch/benchmarks/process_scorecard.py
@@ -8,7 +8,7 @@ nops = len(ops)
 pivot_op_shape = df.pivot_table(values="time", index=["operator", "shape"], columns=["fuser"])
 pivot_speedups = (pivot_op_shape.T / pivot_op_shape["eager"]).T
 
-plt.rcParams["figure.figsize"] = (20,100)
+plt.rcParams["figure.figsize"] = (20, 100)
 fig, axs = plt.subplots(nops)
 plt.subplots_adjust(hspace=0.5)
 for idx, op in enumerate(ops):
diff --git a/functorch/benchmarks/transformer_fusion_patterns/benchmark.py b/functorch/benchmarks/transformer_fusion_patterns/benchmark.py
index 1ffb9bcc709..a6646e150c5 100644
--- a/functorch/benchmarks/transformer_fusion_patterns/benchmark.py
+++ b/functorch/benchmarks/transformer_fusion_patterns/benchmark.py
@@ -1,5 +1,4 @@
 import torch
-import time
 from functorch.compile import memory_efficient_fusion, clear_compile_cache
 import benchmark_helper
 
diff --git a/functorch/benchmarks/transformer_fusion_patterns/benchmark_helper.py b/functorch/benchmarks/transformer_fusion_patterns/benchmark_helper.py
index 874b196ec62..bad27572e97 100644
--- a/functorch/benchmarks/transformer_fusion_patterns/benchmark_helper.py
+++ b/functorch/benchmarks/transformer_fusion_patterns/benchmark_helper.py
@@ -63,7 +63,9 @@ def profile_cuda_kernels(fn, args, string_id="Model time"):
     print("################################################\n\n\n\n")
 
 
-def time_with_torch_timer(fn, args, string_id, kwargs={}):
+def time_with_torch_timer(fn, args, string_id, kwargs=None):
+    if kwargs is None:
+        kwargs = {}
     print("################################################")
     print(f"#### Torch Timer for {string_id} starts #########")
     print("################################################")
diff --git a/functorch/benchmarks/transformer_fusion_patterns/bias_gelu_dropout.py b/functorch/benchmarks/transformer_fusion_patterns/bias_gelu_dropout.py
index 8126ec42696..b2318068645 100644
--- a/functorch/benchmarks/transformer_fusion_patterns/bias_gelu_dropout.py
+++ b/functorch/benchmarks/transformer_fusion_patterns/bias_gelu_dropout.py
@@ -1,9 +1,8 @@
 import torch
-import time
 from functorch.compile import memory_efficient_pointwise_fusion, clear_compile_cache
 import benchmark_helper
 
-### ALL comments regarding the patetrns
+# ALL comments regarding the patterns
 
 
 def bias_gelu_dropout(input, bias):
diff --git a/functorch/docs/source/_static/images/functorch.svg b/functorch/docs/source/_static/images/functorch.svg
index b1ac5cfd4a8..ec7d794122b 100644
--- a/functorch/docs/source/_static/images/functorch.svg
+++ b/functorch/docs/source/_static/images/functorch.svg
@@ -3,4 +3,4 @@
   font-family: Arial Black;
   dominant-baseline: central;
   text-anchor: middle;
-}</style></svg>
\ No newline at end of file
+}</style></svg>
diff --git a/functorch/docs/source/aot_autograd.rst b/functorch/docs/source/aot_autograd.rst
index da3da57f9ab..5123a35485b 100644
--- a/functorch/docs/source/aot_autograd.rst
+++ b/functorch/docs/source/aot_autograd.rst
@@ -40,4 +40,4 @@ Compilers (experimental)
     :nosignatures:
 
     nop
-    ts_compile
\ No newline at end of file
+    ts_compile
diff --git a/functorch/examples/dp_cifar10/cifar10_opacus.py b/functorch/examples/dp_cifar10/cifar10_opacus.py
index b16f2e986e3..bcd0aae8b9d 100644
--- a/functorch/examples/dp_cifar10/cifar10_opacus.py
+++ b/functorch/examples/dp_cifar10/cifar10_opacus.py
@@ -465,4 +465,4 @@ def parse_args():
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/functorch/examples/maml_omniglot/.gitignore b/functorch/examples/maml_omniglot/.gitignore
index 1a2aff21feb..783c4e5c7b4 100644
--- a/functorch/examples/maml_omniglot/.gitignore
+++ b/functorch/examples/maml_omniglot/.gitignore
@@ -1,3 +1,2 @@
 omniglot/
 maml-accs.png
-
diff --git a/functorch/functorch/_src/vmap.py b/functorch/functorch/_src/vmap.py
index 98553359aec..1504107a2ca 100644
--- a/functorch/functorch/_src/vmap.py
+++ b/functorch/functorch/_src/vmap.py
@@ -121,7 +121,7 @@ def _create_batched_inputs(
         flat_in_dims: List[Any], flat_args: List[Any], vmap_level: int, args_spec) -> Tuple:
     # See NOTE [Ignored _remove_batch_dim, _add_batch_dim]
     batched_inputs = [arg if in_dim is None else
-                      _add_batch_dim(arg, in_dim, vmap_level)  # type: ignore
+                      _add_batch_dim(arg, in_dim, vmap_level)
                       for in_dim, arg in zip(flat_in_dims, flat_args)]
     return tree_unflatten(batched_inputs, args_spec)
 
diff --git a/functorch/functorch/csrc/BatchRulesDecompositions.cpp b/functorch/functorch/csrc/BatchRulesDecompositions.cpp
index d8d2591d73f..3256847121e 100644
--- a/functorch/functorch/csrc/BatchRulesDecompositions.cpp
+++ b/functorch/functorch/csrc/BatchRulesDecompositions.cpp
@@ -259,4 +259,3 @@ TORCH_LIBRARY_IMPL(aten, FT_BATCHED_KEY, m) {
 }
 
 }}
-
diff --git a/functorch/functorch/csrc/BatchRulesFactory.cpp b/functorch/functorch/csrc/BatchRulesFactory.cpp
index 97d8dafd9c6..a42583c7cf9 100644
--- a/functorch/functorch/csrc/BatchRulesFactory.cpp
+++ b/functorch/functorch/csrc/BatchRulesFactory.cpp
@@ -97,4 +97,3 @@ TORCH_LIBRARY_IMPL(aten, FT_BATCHED_KEY, m) {
   // Not sure how to add the ones with irregular args to the mix cleanly (i.e. randint takes an extra int parameter)
 }
 }}
-
diff --git a/functorch/functorch/csrc/BatchRulesHelper.h b/functorch/functorch/csrc/BatchRulesHelper.h
index 263bcd2c9d5..2bacfeb53d6 100644
--- a/functorch/functorch/csrc/BatchRulesHelper.h
+++ b/functorch/functorch/csrc/BatchRulesHelper.h
@@ -470,4 +470,3 @@ inline VmapDimVector range(int64_t start, int64_t stop) {
 }
 
 }}
-
diff --git a/functorch/functorch/csrc/BatchRulesLinearAlgebra.cpp b/functorch/functorch/csrc/BatchRulesLinearAlgebra.cpp
index 97228e408c2..d7286c55f68 100644
--- a/functorch/functorch/csrc/BatchRulesLinearAlgebra.cpp
+++ b/functorch/functorch/csrc/BatchRulesLinearAlgebra.cpp
@@ -216,4 +216,3 @@ TORCH_LIBRARY_IMPL(aten, FT_BATCHED_KEY, m) {
   VARIADIC_BDIMS_BOXED(_lu_with_info);
 }
 }}
-
diff --git a/functorch/functorch/csrc/BatchRulesRandomness.cpp b/functorch/functorch/csrc/BatchRulesRandomness.cpp
index b4b8b6b1d40..a4a9ef9abcb 100644
--- a/functorch/functorch/csrc/BatchRulesRandomness.cpp
+++ b/functorch/functorch/csrc/BatchRulesRandomness.cpp
@@ -463,7 +463,7 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchVmapMode, m) {
     decltype(&ATEN_FN2(randint_like, low_dtype)), &ATEN_FN2(randint_like, low_dtype), int64_t, int64_t, TENSOR_LIKE_COMMON_ARG_TYPES>);
   m.impl("rand_like", tensor_like_random_batch_rule<decltype(&ATEN_FN(rand_like)), &ATEN_FN(rand_like), TENSOR_LIKE_COMMON_ARG_TYPES>);
   m.impl("randn_like", tensor_like_random_batch_rule<decltype(&ATEN_FN(randn_like)), &ATEN_FN(randn_like), TENSOR_LIKE_COMMON_ARG_TYPES>);
-  
+
   #undef RANDOM_BATCH_RULE
   #undef RANDOM_BATCH_RULE2
   #undef RANDOM_INPLACE_BATCH_RULE
diff --git a/functorch/functorch/csrc/PlumbingHelper.h b/functorch/functorch/csrc/PlumbingHelper.h
index 7a2d4bed167..8a8441c3bb2 100644
--- a/functorch/functorch/csrc/PlumbingHelper.h
+++ b/functorch/functorch/csrc/PlumbingHelper.h
@@ -37,4 +37,3 @@ inline bool ivalueParticipatesInCurrentLevel(const IValue& ivalue) {
 }
 
 }}
-
diff --git a/functorch/notebooks/_src/plot_ensembling.py b/functorch/notebooks/_src/plot_ensembling.py
index 18ddbf589cc..94cd1151ad7 100644
--- a/functorch/notebooks/_src/plot_ensembling.py
+++ b/functorch/notebooks/_src/plot_ensembling.py
@@ -19,7 +19,6 @@ Let's demonstrate how to do this using an ensemble of simple CNNs.
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from functools import partial
 torch.manual_seed(0)
 
 # Here's a simple CNN
diff --git a/functorch/notebooks/_src/plot_jacobians_and_hessians.py b/functorch/notebooks/_src/plot_jacobians_and_hessians.py
index 27e5c449b4a..99db8155683 100644
--- a/functorch/notebooks/_src/plot_jacobians_and_hessians.py
+++ b/functorch/notebooks/_src/plot_jacobians_and_hessians.py
@@ -9,7 +9,6 @@ efficiently using a standard autodiff system like PyTorch Autograd; functorch
 provides ways of computing various higher-order autodiff quantities efficiently.
 """
 import torch
-import torch.nn as nn
 import torch.nn.functional as F
 from functools import partial
 torch.manual_seed(0)
diff --git a/functorch/notebooks/_src/plot_per_sample_gradients.py b/functorch/notebooks/_src/plot_per_sample_gradients.py
index 3c275a65d0e..0feb2b80d94 100644
--- a/functorch/notebooks/_src/plot_per_sample_gradients.py
+++ b/functorch/notebooks/_src/plot_per_sample_gradients.py
@@ -12,7 +12,6 @@ and optimization research.
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from functools import partial
 torch.manual_seed(0)
 
 # Here's a simple CNN
diff --git a/functorch/notebooks/colab/readme.md b/functorch/notebooks/colab/readme.md
index b3feae5ec25..fbdf129da00 100644
--- a/functorch/notebooks/colab/readme.md
+++ b/functorch/notebooks/colab/readme.md
@@ -1,5 +1,5 @@
-### Holds the colab ready versions of the notebook tutorials.  
+### Holds the colab ready versions of the notebook tutorials.
 
 These are similar to the jupyter notebooks, but have additional colab specific changes including the building of functorch in colab to prep for running.
 
-The colabs and notebooks are not auto-synced atm, thus currently updates to one need to be synched to the other. 
+The colabs and notebooks are not auto-synced at the moment, so updates to one must be manually synced to the other.
diff --git a/functorch/writing_batching_rules.md b/functorch/writing_batching_rules.md
index 74ccf739e01..5f571c41708 100644
--- a/functorch/writing_batching_rules.md
+++ b/functorch/writing_batching_rules.md
@@ -96,6 +96,3 @@ There's a couple different resources for finding batching rules to write.
 1. [BatchingRegistrations.cpp](functorch/csrc/BatchingRegistrations.cpp): This is probably the easiest place to start. These were batching rules that were written with an old API, and thus have a lot of cruft in them that are no longer necessary. Porting these batching rules to using one of the above options is an easy way to get started and help us reduce tech debt :) Once you've gotten your footing with writing batching rules, you can start helping with writing new batching rules.
 2. Popular operators. See [1](https://github.com/facebookresearch/functorch/issues/112), [2](https://github.com/facebookresearch/functorch/issues/101), [3](https://github.com/facebookresearch/functorch/issues/102), and [4](https://github.com/facebookresearch/functorch/issues/102). These contain lists of (user-facing) PyTorch operators sorted by usages, along with whether they have a batching rule implemented or not.
 3. [Master List](https://docs.google.com/spreadsheets/d/1Sp4HUjxwMifS5oDQg0yvjqk7hKOpCfKO4jWH4MTGP-k/edit#gid=0). This is the master list of vmap operator support :). It's generated by [this script](op_analysis/gen_data.py). Theoretically, we want to support most of the operators in that list (that aren't composite or out variants).
-
-
-