From 24aa8b9b9ac94a9776add5f81d6e9effcbda18ee Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Tue, 11 Jul 2023 21:48:22 +0000 Subject: [PATCH] Revert "Deprecate registering autograd kernels at not an autograd key (#104481)" This reverts commit ed13ab666419ae5dd3adbdb048c8f96f62b14b3d. Reverted https://github.com/pytorch/pytorch/pull/104481 on behalf of https://github.com/atalman due to failed in periodic tests ([comment](https://github.com/pytorch/pytorch/pull/104481#issuecomment-1631552846)) --- .ci/pytorch/test.sh | 1 - aten/src/ATen/core/VariableFallbackKernel.cpp | 23 +- test/autograd/test_fallback.py | 372 ------------------ test/test_autograd.py | 1 - test/test_dispatch.py | 30 +- torch/_C/__init__.pyi.in | 3 - .../autograd_not_implemented_fallback.cpp | 176 --------- .../autograd_not_implemented_fallback.h | 21 - torch/csrc/autograd/init.cpp | 32 -- torch/csrc/distributed/c10d/Ops.cpp | 20 +- 10 files changed, 28 insertions(+), 651 deletions(-) delete mode 100644 test/autograd/test_fallback.py diff --git a/.ci/pytorch/test.sh b/.ci/pytorch/test.sh index 15b120e3855..82533dd2615 100755 --- a/.ci/pytorch/test.sh +++ b/.ci/pytorch/test.sh @@ -268,7 +268,6 @@ test_dynamo_shard() { test_package \ test_legacy_vmap \ test_custom_op_testing \ - test_content_store \ export/test_db \ functorch/test_dims \ functorch/test_aotdispatch \ diff --git a/aten/src/ATen/core/VariableFallbackKernel.cpp b/aten/src/ATen/core/VariableFallbackKernel.cpp index 88254bb3e5d..d2e82de512e 100644 --- a/aten/src/ATen/core/VariableFallbackKernel.cpp +++ b/aten/src/ATen/core/VariableFallbackKernel.cpp @@ -1,7 +1,6 @@ #include #include #include -#include /* * This file implements a variable fallback kernel for custom operators. @@ -28,38 +27,36 @@ namespace { // NB: But not the private use ones; maybe the extension wants // to override it themselves! 
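The note above refers to extensions taking over the Autograd key for their own operators instead of relying on this backend fallback. A minimal sketch of what that looks like, using a hypothetical "myops::myop" operator (the namespace, schema, and CPU kernel below are made up for illustration; the registration APIs are the same mechanism this file uses):

#include <torch/library.h>

// Hypothetical custom operator definition.
TORCH_LIBRARY(myops, m) {
  m.def("myop(Tensor self) -> Tensor");
}

// Backend kernel for the op (a trivial placeholder implementation).
TORCH_LIBRARY_IMPL(myops, CPU, m) {
  m.impl("myop", [](const at::Tensor& self) { return self.clone(); });
}

// If the op is not differentiable, the extension can register an explicit
// fallthrough at the Autograd key so dispatch goes straight to the backend
// kernel. Alternatively, torch::autograd::autogradNotImplementedFallback()
// (declared in torch/csrc/autograd/autograd_not_implemented_fallback.h) can
// be registered here instead, so that backpropagating through the op raises
// an explicit error rather than silently producing no gradient.
TORCH_LIBRARY_IMPL(myops, Autograd, m) {
  m.impl("myop", torch::CppFunction::makeFallthrough());
}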
-#define AUTOGRAD_FALLBACK torch::autograd::basicAutogradNotImplementedFallback() - TORCH_LIBRARY_IMPL(_, AutogradOther, m) { - m.fallback(AUTOGRAD_FALLBACK); + m.fallback(torch::CppFunction::makeFallthrough()); } TORCH_LIBRARY_IMPL(_, AutogradCPU, m) { - m.fallback(AUTOGRAD_FALLBACK); + m.fallback(torch::CppFunction::makeFallthrough()); } TORCH_LIBRARY_IMPL(_, AutogradXPU, m) { - m.fallback(AUTOGRAD_FALLBACK); + m.fallback(torch::CppFunction::makeFallthrough()); } TORCH_LIBRARY_IMPL(_, AutogradCUDA, m) { - m.fallback(AUTOGRAD_FALLBACK); + m.fallback(torch::CppFunction::makeFallthrough()); } TORCH_LIBRARY_IMPL(_, AutogradXLA, m) { - m.fallback(AUTOGRAD_FALLBACK); + m.fallback(torch::CppFunction::makeFallthrough()); } TORCH_LIBRARY_IMPL(_, AutogradLazy, m) { - m.fallback(AUTOGRAD_FALLBACK); + m.fallback(torch::CppFunction::makeFallthrough()); } TORCH_LIBRARY_IMPL(_, AutogradMPS, m) { - m.fallback(AUTOGRAD_FALLBACK); + m.fallback(torch::CppFunction::makeFallthrough()); } TORCH_LIBRARY_IMPL(_, AutogradMeta, m) { - m.fallback(AUTOGRAD_FALLBACK); + m.fallback(torch::CppFunction::makeFallthrough()); } // see Note [ADInplaceOrView key] @@ -68,9 +65,7 @@ TORCH_LIBRARY_IMPL(_, ADInplaceOrView, m) { } TORCH_LIBRARY_IMPL(_, AutogradHPU, m) { - m.fallback(AUTOGRAD_FALLBACK); + m.fallback(torch::CppFunction::makeFallthrough()); } -#undef AUTOGRAD_FALLBACK - } diff --git a/test/autograd/test_fallback.py b/test/autograd/test_fallback.py deleted file mode 100644 index 415e4f2c10e..00000000000 --- a/test/autograd/test_fallback.py +++ /dev/null @@ -1,372 +0,0 @@ -# Owner(s): ["module: autograd"] - -import torch -from torch.library import Library -from torch.testing._internal.common_utils import ( - TestCase, - parametrize, - instantiate_parametrized_tests, - run_tests, -) -import contextlib -import numpy as np -import warnings - -@contextlib.contextmanager -def autograd_fallback_mode(mode): - prev = torch._C._get_autograd_fallback_mode() - try: - torch._C._set_autograd_fallback_mode(mode) - yield - finally: - torch._C._set_autograd_fallback_mode(prev) - -class TestAutogradFallback(TestCase): - test_ns = '_test_autograd_fallback' - - def tearDown(self): - if hasattr(torch.ops, self.test_ns): - delattr(torch.ops, self.test_ns) - if hasattr(self, 'lib'): - del self.lib.m - del self.lib - - def get_op(self, name): - return getattr(getattr(torch.ops, self.test_ns), name).default - - def get_lib(self): - lib = Library(self.test_ns, "FRAGMENT") - self.lib = lib - return lib - - @parametrize("mode", ("nothing", "warn")) - def test_no_grad(self, mode): - with autograd_fallback_mode(mode): - lib = self.get_lib() - lib.define("foo(Tensor a, Tensor b, int c) -> Tensor") - lib.impl("foo", lambda a, b, c: a + b + c, "CPU") - op = self.get_op("foo") - - with warnings.catch_warnings(): - warnings.simplefilter("error") - with torch.no_grad(): - a = torch.randn([], requires_grad=True) - b = torch.randn([], requires_grad=True) - out = op(a, b, 1) - self.assertFalse(out.requires_grad) - - with warnings.catch_warnings(): - warnings.simplefilter("error") - a = torch.randn([]) - b = torch.randn([]) - out = op(a, b, 1) - self.assertFalse(out.requires_grad) - - @parametrize("mode", ("nothing", "warn")) - def test_no_autograd_kernel(self, mode): - with autograd_fallback_mode(mode): - lib = self.get_lib() - lib.define("foo(Tensor a, Tensor b, int c) -> Tensor") - op = self.get_op("foo") - - def foo_impl(a, b, c): - result = a.detach().numpy() + b.detach().numpy() + c - return torch.tensor(result) - - lib.impl("foo", foo_impl, "CPU") 
- - # Some inputs requiring grad - a = torch.randn([], requires_grad=False) - b = torch.randn([], requires_grad=True) - out = op(a, b, 1).sum() - with self._check_ctx(mode, mode_nothing_raises=True): - out.backward() - self.assertIsNone(b.grad) - - def _check_ctx(self, mode, *, mode_nothing_raises=False): - if mode == "warn": - return self.assertWarnsRegex(UserWarning, 'an autograd kernel was not registered') - assert mode == "nothing" - if mode_nothing_raises: - return self.assertRaisesRegex(RuntimeError, "does not require grad") - return contextlib.nullcontext() - - @parametrize("mode", ("nothing", "warn")) - def test_no_autograd_kernel_inplace(self, mode): - with autograd_fallback_mode(mode): - # input modified in-place gets returned as output - lib = self.get_lib() - lib.define("foo(Tensor(a!) self, Tensor(b!) y) -> (Tensor(a!), Tensor(b!))") - op = self.get_op("foo") - - def foo_impl(x, y): - with torch.no_grad(): - x.sin_() - y.cos_() - return x, y - - lib.impl("foo", foo_impl, "CPU") - - x = torch.randn(3, requires_grad=True) - w = x.clone() - v = x.clone() - y0 = w[0] - y1 = v[1] - z0, z1 = op(y0, y1) - for tensor in [w, v, z0, z1, y0, y1]: - with self._check_ctx(mode): - tensor.sum().backward(retain_graph=True) - - # no outputs: we don't do anything. Maybe we should in the future. - # This is not a common failure mode. - lib.define("bar(Tensor(a!) self) -> ()") - op = self.get_op("bar") - - def bar_impl(x): - with torch.no_grad(): - x.sin_() - - lib.impl("bar", bar_impl, "CPU") - with warnings.catch_warnings(): - warnings.simplefilter("error") - x = torch.randn([], requires_grad=True) - y = x.clone() - z = op(y) - y.backward() - self.assertEqual(x.grad, torch.ones_like(x)) - - @parametrize("mode", ("nothing", "warn")) - def test_cpu_return_self(self, mode): - with autograd_fallback_mode(mode): - # To be clear, none of these situations are OK and will lead - # to other problems down the line. We're testing them because - # it is fairly common to actually do these things. - lib = Library(self.test_ns, "FRAGMENT") - lib.define("foo(Tensor self) -> Tensor") - lib.impl("foo", lambda x: x, "CPU") - op = self.get_op("foo") - - x = torch.randn(3, requires_grad=True) - y = op(x).sum() - with self._check_ctx(mode): - y.backward() - self.assertEqual(x.grad, torch.ones_like(x)) - - lib.define("bar(Tensor(a!) 
self) -> Tensor(a!)") - lib.impl("bar", lambda x: x, "CPU") - op = self.get_op("bar") - - x = torch.randn(3, requires_grad=True) - y = op(x).sum() - with self._check_ctx(mode): - y.backward() - self.assertEqual(x.grad, torch.ones_like(x)) - - @parametrize("mode", ("nothing", "warn")) - def test_composite_registered_to_cpu(self, mode): - with autograd_fallback_mode(mode): - lib = Library(self.test_ns, "FRAGMENT") - lib.define("foo(Tensor self) -> Tensor") - lib.impl("foo", lambda x: x.sin().sum(), "CPU") - op = self.get_op("foo") - - x = torch.randn(3, requires_grad=True) - y = op(x) - with self._check_ctx(mode): - y.backward() - self.assertEqual(x.grad, x.cos()) - - @parametrize("mode", ("nothing", "warn")) - def test_autograd_function_registered_to_cpu(self, mode): - with autograd_fallback_mode(mode): - lib = Library(self.test_ns, "FRAGMENT") - lib.define("foo(Tensor self) -> Tensor") - - class NumpySin(torch.autograd.Function): - @staticmethod - def forward(ctx, x): - ctx.save_for_backward(x) - return torch.tensor(np.sin(x.cpu().numpy())) - - @staticmethod - def backward(ctx, gx): - x, = ctx.saved_tensors - return gx * x.cos() - - lib.impl("foo", NumpySin.apply, "CPU") - op = self.get_op("foo") - - x = torch.randn(3, requires_grad=True) - y = op(x).sum() - with self._check_ctx(mode): - y.backward() - self.assertEqual(x.grad, x.cos()) - - @parametrize("mode", ("nothing", "warn")) - def test_inplace_autograd_function_registered_to_cpu(self, mode): - with autograd_fallback_mode(mode): - lib = Library(self.test_ns, "FRAGMENT") - lib.define("foo(Tensor(a!) self) -> Tensor(a!)") - - class NumpySin_(torch.autograd.Function): - @staticmethod - def forward(ctx, x): - ctx.save_for_backward(x.clone()) - x_np = x.detach().numpy() - np.sin(x_np, out=x_np) - ctx.mark_dirty(x) - return x - - @staticmethod - def backward(ctx, gx): - x, = ctx.saved_tensors - return gx * x.cos() - - lib.impl("foo", NumpySin_.apply, "CPU") - op = self.get_op("foo") - - x = torch.randn(3, requires_grad=True) - z = x.clone() - w = z[0] - y = op(w) - - expected = torch.zeros_like(x) - expected[0] = x[0].cos() - with self._check_ctx(mode): - gx, = torch.autograd.grad(y, x, torch.ones_like(y), retain_graph=True) - self.assertEqual(gx, expected) - - expected = torch.ones_like(x) - expected[0] = x[0].cos() - with self._check_ctx(mode): - gx, = torch.autograd.grad(z, x, torch.ones_like(z)) - self.assertEqual(gx, expected) - - @parametrize("mode", ("nothing", "warn")) - def test_inplace_on_tensor_that_does_not_require_grad(self, mode): - # We don't do anything special (that is, we don't rebase history). - # See NOTE [autograd fallback and in-place operations] for why - with autograd_fallback_mode(mode): - lib = Library(self.test_ns, "FRAGMENT") - - # Correct usage of (a!) - lib.define("foo(Tensor(a!) self, Tensor other) -> Tensor(a!)") - - def foo_impl(x, y): - x_d = x.detach() - y = y.detach() - x_d.add_(y) - return x - - lib.impl("foo", foo_impl, "CPU") - foo = self.get_op("foo") - - # Incorrect usage of (a!): user doesn't return tensor as-is - lib.define("bar(Tensor(a!) self, Tensor other) -> Tensor(a!)") - - def bar_impl(x, y): - x_d = x.detach() - y = y.detach() - x_d.add_(y) - return x_d.clone() - - lib.impl("bar", bar_impl, "CPU") - bar = self.get_op("bar") - - # User mutated input tensor but didn't return it. - lib.define("baz(Tensor(a!) 
self, Tensor other) -> ()") - - def baz_impl(x, y): - x_d = x.detach() - y = y.detach() - x_d.add_(y) - - lib.impl("baz", baz_impl, "CPU") - baz = self.get_op("baz") - - # Test in-place on non-view - for op in (foo, bar, baz): - x = torch.randn(3) - y = torch.randn(3, requires_grad=True) - with self.assertRaisesRegex(RuntimeError, "does not require grad"): - z = x.clone() - op(z, y) - torch.autograd.grad(z, y, torch.ones_like(z), allow_unused=True) - - # Test in-place on view - for op in (foo, bar, baz): - x = torch.randn(3) - y = torch.randn(3, requires_grad=True) - with self.assertRaisesRegex(RuntimeError, "does not require grad"): - z = x[:] - op(z, y) - torch.autograd.grad(z, x, torch.ones_like(z), allow_unused=True) - - @parametrize("mode", ("nothing", "warn")) - def test_post_autograd_returns_leaf(self, mode): - with autograd_fallback_mode(mode): - lib = self.get_lib() - lib.define("foo(Tensor a) -> (Tensor, Tensor)") - op = self.get_op("foo") - - lib.impl("foo", lambda a: (a.clone(), a.clone().detach().requires_grad_()), "CPU") - x = torch.randn(3, requires_grad=True) - y, z = op(x) - with self._check_ctx(mode): - z.sum().backward() - - @parametrize("mode", ("nothing", "warn")) - def test_post_autograd_returns_mix_of_requires_grad_tensors(self, mode): - with autograd_fallback_mode(mode): - lib = self.get_lib() - lib.define("foo(Tensor a, Tensor b) -> (Tensor, Tensor, Tensor)") - op = self.get_op("foo") - - def foo_impl(a, b): - with torch.no_grad(): - x = a.clone() - z = b.clone() - y = a * b - return x, y, z - - lib.impl("foo", foo_impl, "CPU") - a = torch.randn(3, requires_grad=True) - b = torch.randn(3, requires_grad=True) - x, y, z = op(a, b) - - with self._check_ctx(mode, mode_nothing_raises=True): - torch.autograd.grad(x, (a, b), torch.ones_like(x), allow_unused=True, retain_graph=True) - - with self._check_ctx(mode, mode_nothing_raises=False): - torch.autograd.grad(y, (a, b), torch.ones_like(y), allow_unused=True, retain_graph=True) - - with self._check_ctx(mode, mode_nothing_raises=True): - torch.autograd.grad(z, (a, b), torch.ones_like(z), allow_unused=True, retain_graph=True) - - @parametrize("mode", ("nothing", "warn")) - def test_supports_tensor_lists(self, mode): - with autograd_fallback_mode(mode): - lib = self.get_lib() - lib.define("foo(Tensor[] a) -> Tensor[]") - op = self.get_op("foo") - - def foo_impl(a): - x, y, z = a - with torch.no_grad(): - return x + y + z, x * y * z - - lib.impl("foo", foo_impl, "CPU") - x = torch.randn(3, requires_grad=True) - y = torch.randn(1, requires_grad=True) - z = torch.randn(2, 1, requires_grad=True) - a, b = op([x, y, z]) - with self._check_ctx(mode, mode_nothing_raises=True): - torch.autograd.grad(a, (x, y, z), torch.ones_like(a), allow_unused=True, retain_graph=True) - with self._check_ctx(mode, mode_nothing_raises=True): - torch.autograd.grad(b, (x, y, z), torch.ones_like(b), allow_unused=True, retain_graph=True) - - -instantiate_parametrized_tests(TestAutogradFallback) - -if __name__ == '__main__': - run_tests() diff --git a/test/test_autograd.py b/test/test_autograd.py index 767d849691e..82b6af455db 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -11267,7 +11267,6 @@ class TestAutogradMultipleDispatch(TestCase): from autograd.test_complex import TestAutogradComplex # noqa: F401 from autograd.test_functional import TestAutogradFunctional # noqa: F401 -from autograd.test_fallback import TestAutogradFallback # noqa: F401 # e.g., TestAutogradDeviceTypeCPU and TestAutogradDeviceTypeCUDA 
instantiate_device_type_tests( diff --git a/test/test_dispatch.py b/test/test_dispatch.py index 0fdb92ba0da..1b4e0411943 100644 --- a/test/test_dispatch.py +++ b/test/test_dispatch.py @@ -391,7 +391,7 @@ CPU: impl_t_t [kernel] CUDA: default_def_name_t_t [math kernel] XLA: default_def_name_t_t [math kernel] AutogradOther: default_def_name_t_t [math kernel] -AutogradCPU: registered in pytorch framework [backend fallback] +AutogradCPU: fallthrough registered in pytorch framework [backend fallback] AutogradCUDA: default_def_name_t_t [math kernel] AutogradXLA: default_def_name_t_t [math kernel] ''') @@ -456,7 +456,7 @@ CPU: fn_cpu [kernel] CUDA: fn_math [math kernel] XLA: fn_math [math kernel] AutogradOther: fn_math [math kernel] -AutogradCPU: registered in pytorch framework [backend fallback] +AutogradCPU: fallthrough registered in pytorch framework [backend fallback] AutogradCUDA: fn_math [math kernel] AutogradXLA: fn_math [math kernel] ''') @@ -587,10 +587,10 @@ Undefined: fn_defaultbackend [default backend kernel] CPU: fn_cpu [kernel] CUDA: fn_defaultbackend [default backend kernel] XLA: fn_defaultbackend [default backend kernel] -AutogradOther: registered in pytorch framework [backend fallback] -AutogradCPU: registered in pytorch framework [backend fallback] -AutogradCUDA: registered in pytorch framework [backend fallback] -AutogradXLA: registered in pytorch framework [backend fallback] +AutogradOther: fallthrough registered in pytorch framework [backend fallback] +AutogradCPU: fallthrough registered in pytorch framework [backend fallback] +AutogradCUDA: fallthrough registered in pytorch framework [backend fallback] +AutogradXLA: fallthrough registered in pytorch framework [backend fallback] ''') def test_computed_table_with_cpu_autograd_defaultbackend(self): @@ -814,9 +814,9 @@ XLA fn_XLA [kernel] Lazy fn_Lazy [kernel] FPGA fn_CompositeImplicitAutograd [math kernel] AutogradOther fn_CompositeImplicitAutograd [math kernel] -AutogradCPU [backend fallback] -AutogradXLA [backend fallback] -AutogradLazy [backend fallback] +AutogradCPU fallthrough [backend fallback] +AutogradXLA fallthrough [backend fallback] +AutogradLazy fallthrough [backend fallback] ''' ) @@ -836,8 +836,8 @@ Lazy fn_Lazy [kernel] FPGA fn_CompositeImplicitAutograd [math kernel] AutogradOther fn_CompositeImplicitAutograd [math kernel] AutogradCPU fn_AutogradCPU [kernel] -AutogradXLA [backend fallback] -AutogradLazy [backend fallback] +AutogradXLA fallthrough [backend fallback] +AutogradLazy fallthrough [backend fallback] ''' ) self.assertExpectedInline( @@ -869,10 +869,10 @@ CPU fn_CPU [kernel] XLA fn_XLA [kernel] Lazy fn_Lazy [kernel] FPGA fn_CompositeExplicitAutograd [default backend kernel] -AutogradOther [backend fallback] +AutogradOther fallthrough [backend fallback] AutogradCPU fn_AutogradCPU [kernel] -AutogradXLA [backend fallback] -AutogradLazy [backend fallback] +AutogradXLA fallthrough [backend fallback] +AutogradLazy fallthrough [backend fallback] ''' ) @@ -906,7 +906,7 @@ XLA fn_CompositeImplicitAutograd [math kernel] Lazy fn_CompositeImplicitAutograd [math kernel] FPGA fn_FPGA [kernel] AutogradOther ambiguous_autogradother [ambiguous autogradother] -AutogradCPU [backend fallback] +AutogradCPU fallthrough [backend fallback] AutogradXLA fn_CompositeImplicitAutograd [math kernel] AutogradLazy fn_CompositeImplicitAutograd [math kernel] ''' diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index 3abd9db3c41..de2022ad539 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -1229,9 
+1229,6 @@ class _InferenceMode: def __enter__(self): ... def __exit__(self, exc_type, exc_value, traceback): ... -def _set_autograd_fallback_mode(mode: str) -> None: ... -def _get_autograd_fallback_mode() -> str: ... - # Defined in torch/csrc/jit/python/script_init.cpp class LoggerBase: ... class NoopLogger(LoggerBase): ... diff --git a/torch/csrc/autograd/autograd_not_implemented_fallback.cpp b/torch/csrc/autograd/autograd_not_implemented_fallback.cpp index c81108b2421..5c9e7753124 100644 --- a/torch/csrc/autograd/autograd_not_implemented_fallback.cpp +++ b/torch/csrc/autograd/autograd_not_implemented_fallback.cpp @@ -45,184 +45,8 @@ void _foreach_tensor( } } -AutogradFallbackMode kAutogradFallbackMode = AutogradFallbackMode::Warn; - } // namespace -void setAutogradFallbackMode(AutogradFallbackMode mode) { - TORCH_CHECK(mode != AutogradFallbackMode::Error, "NYI: mode='error'"); - kAutogradFallbackMode = mode; -} - -AutogradFallbackMode getAutogradFallbackMode() { - return kAutogradFallbackMode; -} - -static void warnAutogradNotImplemented(const std::string& op_name) { - TORCH_WARN( - op_name, - ": an autograd kernel was not registered to the Autograd key(s) ", - "but we are trying to backprop through it. This may lead to silently incorrect behavior. ", - "This behavior is deprecated and will be removed in a future version of PyTorch. ", - "If your operator is differentiable, please ensure you have registered an " - "autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, " - "DispatchKey::CompositeImplicitAutograd). If your operator is not " - "differentiable, or to squash this warning and use the previous behavior, " - "please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd."); -} - -struct WarnNotImplemented : public Node { - WarnNotImplemented( - std::string op_name, - int64_t num_outputs, - edge_list&& next_edges) - : Node(std::move(next_edges)), - op_name(std::move(op_name)), - num_outputs(num_outputs) {} - - WarnNotImplemented(std::string op_name, int64_t num_outputs) - : op_name(std::move(op_name)), num_outputs(num_outputs) {} - - variable_list apply(variable_list&& inputs) override; - - std::string op_name; - int64_t num_outputs; -}; - -auto WarnNotImplemented::apply(variable_list&& inputs) -> variable_list { - warnAutogradNotImplemented(op_name); - std::vector output(num_outputs); - return output; -} - -static void basicAutogradNotImplementedFallbackImpl( - const c10::OperatorHandle& op, - c10::DispatchKeySet dispatch_keys, - torch::jit::Stack* stack) { - const auto& schema = op.schema(); - const auto& op_name = schema.operator_name().name; - const auto num_arguments = schema.arguments().size(); - const auto num_returns = schema.returns().size(); - const auto stack_start = stack->size() - num_arguments; - const bool grad_mode = GradMode::is_enabled(); - - if (getAutogradFallbackMode() == AutogradFallbackMode::Nothing) { - op.redispatchBoxed(dispatch_keys & c10::after_autograd_keyset, stack); - return; - } - TORCH_INTERNAL_ASSERT( - getAutogradFallbackMode() == AutogradFallbackMode::Warn); - - bool any_input_requires_grad = false; - if (grad_mode) { - _foreach_tensor( - [&](size_t _, size_t idx_arg, const at::Tensor& t) { - if (t.requires_grad()) { - any_input_requires_grad = true; - } - }, - stack, - stack_start, - num_arguments); - } - - std::shared_ptr grad_fn; - if (any_input_requires_grad) { - // NB: It is standard to collect edges from all tensors - // (see generated/VariableTypeEverything.cpp for examples) - std::vector 
all_tensors_on_stack; - _foreach_tensor( - [&](size_t _, size_t idx_arg, const at::Tensor& t) { - all_tensors_on_stack.push_back(&t); - }, - stack, - stack_start, - num_arguments); - grad_fn = std::shared_ptr( - new WarnNotImplemented(op_name, all_tensors_on_stack.size()), - deleteNode); - grad_fn->set_next_edges(collect_next_edges(all_tensors_on_stack)); - } - - op.redispatchBoxed(dispatch_keys & c10::after_autograd_keyset, stack); - - if (any_input_requires_grad) { - // NB: if the operator mutates any inputs in-place and does not return them - // as outputs, we are unable to lazily raise a warning. This is OK because - // we don't expect many existing operators to do this because of the amount - // of technical expertise necessary (you would need to manually register an - // autograd kernel without using autograd.Function) - _foreach_tensor( - [&](size_t _, size_t idx_ret, const at::Tensor& t) { - if (!isDifferentiableType(t.scalar_type())) { - return; - } - const bool is_mutable_output = - schema.is_aliasing({c10::SchemaArgType::output, idx_ret}) && - schema.is_mutable({c10::SchemaArgType::output, idx_ret}); - - // If the post-autograd implementation returns Tensors that require - // grad, then we install a hook that will warn during the backwards. - // - // NB: If the operation is inplace and the inputs were views, - // it is possible that the history was rebased and the hook will - // not warn in all places where it should. That is, the following - // won't warn: - // >>> x = torch.randn(3, 3, requires_grad=True) - // >>> z = x.clone() - // >>> w = z[0] - // >>> k = w[0] - // >>> y = op(k) - // >>> torch.autograd.grad(z.sum(), w) - if (t.requires_grad()) { - t.register_hook([op_name](const at::Tensor& grad) { - warnAutogradNotImplemented(op_name); - return grad; - }); - // If history is rebased, then we will attempt to warn - // on the view's base. This will catch most cases (because - // users typically call .backward() and backprop through - // the entire program). - if (t.is_view() && is_mutable_output) { - // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) - const_cast(t._base()).register_hook( - [op_name](const at::TensorBase& grad) { - warnAutogradNotImplemented(op_name); - return grad; - }); - } - return; - } - - // If the post-autograd implementation returns any Tensors that - // don't require grad, then we install the WarnNotImplemented grad_fn. - // This grad_fn warns in backward and returns undefined tensor - // gradients. - // - // NOTE [autograd fallback and in-place operations] - // If the schema says the output is mutable, and the output - // is an input, and the input is a view Tensor, then... - // we're not sure if set_history is OK to do, so we just skip - // adding the grad_fn. Builtin operators do rebase_history here, - // but custom operators may have multiple Tensor(a!) returns, - // rebase_history assumes single Tensor(a!) return, and in general - // custom ops don't have a good in-place story. 
- if (!is_mutable_output) { - // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) - set_history(const_cast(t), grad_fn); - } - }, - stack, - stack->size() - num_returns, - num_returns); - } -} - -torch::CppFunction basicAutogradNotImplementedFallback() { - return torch::CppFunction::makeFromBoxedFunction< - &basicAutogradNotImplementedFallbackImpl>(); -} - static void autogradNotImplementedFallbackImpl( const c10::OperatorHandle& op, c10::DispatchKeySet dispatch_keys, diff --git a/torch/csrc/autograd/autograd_not_implemented_fallback.h b/torch/csrc/autograd/autograd_not_implemented_fallback.h index 5e1e37b4a46..db9cfb6c007 100644 --- a/torch/csrc/autograd/autograd_not_implemented_fallback.h +++ b/torch/csrc/autograd/autograd_not_implemented_fallback.h @@ -5,30 +5,9 @@ namespace torch { namespace autograd { -// Default DispatchKey::Autograd fallback for built-in operators. -// Can be registered for custom operators. TORCH_API torch::CppFunction autogradNotImplementedFallback(); -// Default DispatchKey::AdInplaceOrView fallback for built-in operators -// Can be registered for custom operators. TORCH_API torch::CppFunction autogradNotImplementedInplaceOrViewFallback(); -// Default DispatchKey::Autograd fallback for all other operators (i.e. custom -// operators) -TORCH_API torch::CppFunction basicAutogradNotImplementedFallback(); - -enum class AutogradFallbackMode { - Nothing, // Fallback is a redispatch - Warn, // Fallback raises a warning if backward is called - Error, // Fallback raises an error if backward is called -}; - -// Change the behavior of "basicAutogradNotImplementedFallback" -// In Python this is: -// - torch._C._set_autograd_fallback_mode(str) -> None -// - torch._C._get_autograd_fallback_mode() -> str -TORCH_API void setAutogradFallbackMode(AutogradFallbackMode mode); -TORCH_API AutogradFallbackMode getAutogradFallbackMode(); - } // namespace autograd } // namespace torch diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index dc304242439..a3e65c15cb1 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -409,37 +408,6 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject* unused) { auto cls = python_type_class.ptr(); registerPythonTensorClass(device, cls); }); - _C_m.def("_set_autograd_fallback_mode", [](const std::string& mode) { - if (mode == "nothing") { - torch::autograd::setAutogradFallbackMode( - torch::autograd::AutogradFallbackMode::Nothing); - return; - } - if (mode == "warn") { - torch::autograd::setAutogradFallbackMode( - torch::autograd::AutogradFallbackMode::Warn); - return; - } - if (mode == "error") { - torch::autograd::setAutogradFallbackMode( - torch::autograd::AutogradFallbackMode::Error); - return; - } - TORCH_INTERNAL_ASSERT(false, "Unsupported AutogradFallbackMode: ", mode); - }); - _C_m.def("_get_autograd_fallback_mode", []() { - auto mode = torch::autograd::getAutogradFallbackMode(); - switch (mode) { - case torch::autograd::AutogradFallbackMode::Nothing: - return "nothing"; - case torch::autograd::AutogradFallbackMode::Warn: - return "warn"; - case torch::autograd::AutogradFallbackMode::Error: - return "error"; - default: - TORCH_INTERNAL_ASSERT(false, "Unsupported AutogradFallbackMode"); - } - }); _C_m.def("_activate_cuda_trace", []() { activateCUDATrace(); }); diff --git a/torch/csrc/distributed/c10d/Ops.cpp b/torch/csrc/distributed/c10d/Ops.cpp index e9640304eab..db7c4986b23 100644 --- 
a/torch/csrc/distributed/c10d/Ops.cpp +++ b/torch/csrc/distributed/c10d/Ops.cpp @@ -485,19 +485,11 @@ namespace { m.impl(#FUNC, FUNC##DEV); \ } -#define REGISTER_C10D_FALLTHROUGH(FUNC, KEY) \ - TORCH_LIBRARY_IMPL(c10d, KEY, m) { \ - m.impl(#FUNC, torch::CppFunction::makeFallthrough()); \ - } - // 1st level expansion -#define REGISTER_C10D_OP(FUNC) \ - REGISTER_C10D_OP1(FUNC, CPU) \ - REGISTER_C10D_FALLTHROUGH(FUNC, AutogradCPU) \ - REGISTER_C10D_OP1(FUNC, CUDA) \ - REGISTER_C10D_FALLTHROUGH(FUNC, AutogradCUDA) \ - REGISTER_C10D_OP1(FUNC, PrivateUse1) \ - REGISTER_C10D_FALLTHROUGH(FUNC, AutogradPrivateUse1) +#define REGISTER_C10D_OP(FUNC) \ + REGISTER_C10D_OP1(FUNC, CPU) \ + REGISTER_C10D_OP1(FUNC, CUDA) \ + REGISTER_C10D_OP1(FUNC, PrivateUse1) // Now we start to register ops with the three device keys @@ -527,10 +519,6 @@ TORCH_LIBRARY_IMPL(c10d, CPU, m) { m.impl("monitored_barrier_", monitored_barrier_CPU); } -TORCH_LIBRARY_IMPL(c10d, AutogradCPU, m) { - m.impl("monitored_barrier_", torch::CppFunction::makeFallthrough()); -} - // TODO: The SparseCPU/SparseCUDA dispatched methods are only used to support // sparse all_reduce in the Gloo backend TORCH_LIBRARY_IMPL(c10d, SparseCPU, m) {
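To make the simplified macro concrete, here is a rough expansion of REGISTER_C10D_OP for a single representative op. The op name "allreduce_" and the exact body of REGISTER_C10D_OP1 are assumed from the context lines visible in this hunk rather than copied from the file, and the per-device kernels (allreduce_CPU and friends) are taken to be defined earlier in Ops.cpp. After the revert, only the backend keys get per-op registrations; the Autograd keys are once again covered by the fallthrough fallbacks restored in VariableFallbackKernel.cpp.

// Assumed expansion of REGISTER_C10D_OP(allreduce_) after this revert:
// one backend registration per device key, no per-op Autograd fallthroughs.
TORCH_LIBRARY_IMPL(c10d, CPU, m) {
  m.impl("allreduce_", allreduce_CPU);
}
TORCH_LIBRARY_IMPL(c10d, CUDA, m) {
  m.impl("allreduce_", allreduce_CUDA);
}
TORCH_LIBRARY_IMPL(c10d, PrivateUse1, m) {
  m.impl("allreduce_", allreduce_PrivateUse1);
}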