From 87242d2393119990ebe9043e854317f02536bdff Mon Sep 17 00:00:00 2001
From: Alexander
Date: Sun, 2 May 2021 22:10:29 -0700
Subject: [PATCH] Eliminate global usage of torch.set_default_dtype in test_autograd (#56446)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/56446

Test Plan: Imported from OSS

Reviewed By: ngimel

Differential Revision: D28000589

Pulled By: mruberry

fbshipit-source-id: c8fb2907d656138e72ecf8fb3e572591f8972900
---
 test/test_autograd.py                             | 258 +++++++++---------
 .../_internal/common_methods_invocations.py       |   6 +-
 2 files changed, 138 insertions(+), 126 deletions(-)

diff --git a/test/test_autograd.py b/test/test_autograd.py
index 2308140f6dd..e4e7db5ee63 100644
--- a/test/test_autograd.py
+++ b/test/test_autograd.py
@@ -17,10 +17,6 @@ from functools import reduce, partial
 import torch
 import json
 
-# TODO: remove this global setting
-# Autograd tests use double as the default dtype
-torch.set_default_dtype(torch.double)
-
 from torch import nn
 from torch._six import inf, nan
 from torch.autograd.function import once_differentiable
@@ -327,9 +323,9 @@ class TestAutograd(TestCase):
                 grads[1] * var2 * scale + grads[4] * scale,
             )
 
-        t1 = torch.rand(10, requires_grad=True)
-        t2 = torch.rand(10, requires_grad=True)
-        t3 = torch.rand(10)
+        t1 = torch.rand(10, dtype=torch.double, requires_grad=True)
+        t2 = torch.rand(10, dtype=torch.double, requires_grad=True)
+        t3 = torch.rand(10, dtype=torch.double)
         scale = random.randint(0, 10)
         res = MyFunction.apply(t1, t2, scale, t3)
         self.assertEqual(scale, res[0])
@@ -450,7 +446,7 @@ class TestAutograd(TestCase):
     @skipIfNoLapack
     def test_slogdet_sign(self):
-        a = torch.randn(3, 3, requires_grad=True)
+        a = torch.randn(3, 3, dtype=torch.double, requires_grad=True)
         s, logdet = a.slogdet()
 
         # test that sign should not require grad
@@ -763,6 +759,7 @@ class TestAutograd(TestCase):
     def test_hooks_cpp(self):
         # Tests hooks for autograd function implemented in C++
         bn = torch.nn.BatchNorm1d(5, affine=False)
+        bn.double()
         bn.eval()
         counter = [0]
@@ -771,13 +768,13 @@ class TestAutograd(TestCase):
             counter[0] += 1
             return grad * 2
 
-        x = torch.ones(5, 5, requires_grad=True)
+        x = torch.ones(5, 5, dtype=torch.double, requires_grad=True)
         z = bn(x)
         z.register_hook(bw_hook)
         z.sum().backward()
         self.assertEqual(counter[0], 1, msg='bw_hook not called')
-        self.assertEqual(x.grad, torch.ones(5, 5) * 2, atol=1e-5, rtol=0)
+        self.assertEqual(x.grad, torch.ones(5, 5, dtype=torch.double) * 2, atol=1e-5, rtol=0)
 
     def test_hook_none(self):
         # WARNING: this is a test for autograd internals.
@@ -891,15 +888,15 @@ class TestAutograd(TestCase):
         fn = FixedGradientFunction
 
         # sparse first
-        x = torch.randn(size, requires_grad=True)
+        x = torch.randn(size, dtype=torch.double, requires_grad=True)
         (fn.apply(x, sparse_grad1) + fn.apply(x, dense_grad) + fn.apply(x, sparse_grad2)).sum().backward()
         self.assertEqual(x.grad, dense_grad + sparse_grad1 + sparse_grad2)
         # dense first
-        x = torch.randn(size, requires_grad=True)
+        x = torch.randn(size, dtype=torch.double, requires_grad=True)
         (fn.apply(x, dense_grad) + fn.apply(x, sparse_grad1) + fn.apply(x, sparse_grad2)).sum().backward()
         self.assertEqual(x.grad, dense_grad + sparse_grad1 + sparse_grad2)
         # sparse only
-        x = torch.randn(size, requires_grad=True)
+        x = torch.randn(size, dtype=torch.double, requires_grad=True)
         (fn.apply(x, sparse_grad1) + fn.apply(x, sparse_grad2)).sum().backward()
         self.assertEqual(x.grad, sparse_grad1 + sparse_grad2)
@@ -995,8 +992,8 @@ class TestAutograd(TestCase):
         self.assertRaises(RuntimeError, call_backwards)
 
     def test_backward_with_inputs(self):
-        x = torch.randn(2, 2, requires_grad=True)
-        y = torch.randn(2, 2, requires_grad=True)
+        x = torch.randn(2, 2, dtype=torch.double, requires_grad=True)
+        y = torch.randn(2, 2, dtype=torch.double, requires_grad=True)
 
         def fn():
             return x ** 2 + y * x + y ** 2
@@ -1017,31 +1014,31 @@ class TestAutograd(TestCase):
         reset_grad()
         torch.autograd.backward(fn(), gradient, inputs=[x])
         self.assertEqual(x.grad, x_grad_expected)
-        self.assertEqual(y.grad, torch.zeros(2, 2))
+        self.assertEqual(y.grad, torch.zeros(2, 2), exact_dtype=False)
 
         reset_grad()
         torch.autograd.backward(fn(), gradient, inputs=[y])
         self.assertEqual(y.grad, y_grad_expected)
-        self.assertEqual(x.grad, torch.zeros(2, 2))
+        self.assertEqual(x.grad, torch.zeros(2, 2), exact_dtype=False)
 
         reset_grad()
         torch.autograd.backward(fn(), gradient, inputs=y)
         self.assertEqual(y.grad, y_grad_expected)
-        self.assertEqual(x.grad, torch.zeros(2, 2))
+        self.assertEqual(x.grad, torch.zeros(2, 2), exact_dtype=False)
 
         reset_grad()
         self.assertRaisesRegex(RuntimeError, 'cannot be empty',
                                lambda: torch.autograd.backward(fn(), gradient, inputs=[]))
 
     def test_backward_with_nonleaf_inputs(self):
-        x = torch.randn(2, 2, requires_grad=True)
+        x = torch.randn(2, 2, dtype=torch.double, requires_grad=True)
         x_nonleaf = x * 1
-        y = torch.randn(2, 2, requires_grad=True)
-        z = torch.randn(2, 2, requires_grad=True)
+        y = torch.randn(2, 2, dtype=torch.double, requires_grad=True)
+        z = torch.randn(2, 2, dtype=torch.double, requires_grad=True)
 
         out = x_nonleaf ** 2 + y * x_nonleaf + y ** 2
-        out.backward(torch.ones(2, 2), create_graph=True, inputs=[x, y])
+        out.backward(torch.ones(2, 2, dtype=torch.double), create_graph=True, inputs=[x, y])
         x_grad_expected = 2 * x + y
         y_grad_expected = x + 2 * y
@@ -1049,12 +1046,13 @@ class TestAutograd(TestCase):
         self.assertEqual(x.grad, x_grad_expected)
 
         self.assertRaisesRegex(RuntimeError, 'not a leaf Tensor',
-                               lambda: out.backward(torch.ones(2, 2), create_graph=True, inputs=[x, y, x_nonleaf]))
+                               lambda: out.backward(torch.ones(2, 2, dtype=torch.double),
+                                                    create_graph=True, inputs=[x, y, x_nonleaf]))
 
         # backward doesn't have an allow_unused flag, so the behavior of backward
         # when variable is not part of the graph is as if allow_used were true
         # x.grad will simply be None.
-        out.backward(torch.ones(2, 2), create_graph=True, inputs=[z])
+        out.backward(torch.ones(2, 2, dtype=torch.double), create_graph=True, inputs=[z])
         self.assertIsNone(z.grad)
 
     def test_dependent_backward(self):
@@ -1929,7 +1927,7 @@ class TestAutograd(TestCase):
             self.assertIs(a, b)
             return a + b
 
-        x = torch.randn(5, 5, requires_grad=True)
+        x = torch.randn(5, 5, dtype=torch.double, requires_grad=True)
         gradcheck(fn, [x])
         gradgradcheck(fn, [x])
@@ -1950,7 +1948,7 @@ class TestAutograd(TestCase):
             self.assertIs(a, b)
             return a + b
 
-        x = torch.randn(5, 5, requires_grad=True)
+        x = torch.randn(5, 5, dtype=torch.double, requires_grad=True)
         gradcheck(inplace_fn, [x])
         gradgradcheck(inplace_fn, [x])
@@ -2030,7 +2028,7 @@ class TestAutograd(TestCase):
     def test_select_sum(self):
         # both select and sum return Scalars in ATen; ensure they work together.
-        x = torch.randn(10, requires_grad=True)
+        x = torch.randn(10, dtype=torch.double, requires_grad=True)
 
         def func(x):
             return x.select(0, 1).sum()
@@ -2041,9 +2039,9 @@ class TestAutograd(TestCase):
     def test_diagonal_expanded_v(self):
         value = torch.rand([])
         v_expanded = torch.tensor(value).expand(10)
-        a = torch.rand(10, 10, requires_grad=True)
+        a = torch.rand(10, 10, dtype=torch.double, requires_grad=True)
         result, = torch.autograd.grad(a.diagonal(), a, v_expanded)
-        self.assertEqual(result, torch.eye(10) * value)
+        self.assertEqual(result, torch.eye(10, dtype=torch.double) * value)
 
     def test_select_expanded_v(self):
         v_expanded = torch.rand(10).expand(10, 10)
@@ -2735,28 +2733,28 @@ class TestAutograd(TestCase):
         d.sum().backward()
 
     def test_broadcast_tensors(self):
-        f_args_variable = (torch.randn(3, requires_grad=True),
-                           torch.randn(1, 2, 1, requires_grad=True),
-                           torch.randn(1, 1, requires_grad=True),
-                           torch.randn(5, 1, 1, requires_grad=True))
+        f_args_variable = (torch.randn(3, dtype=torch.double, requires_grad=True),
+                           torch.randn(1, 2, 1, dtype=torch.double, requires_grad=True),
+                           torch.randn(1, 1, dtype=torch.double, requires_grad=True),
+                           torch.randn(5, 1, 1, dtype=torch.double, requires_grad=True))
         f_args_tensor = deepcopy(unpack_variables(f_args_variable))
         run_functional_checks(self, "test_broadcast_tensors", "broadcast",
                               lambda a, b, c, d: torch.broadcast_tensors(a, b, c, d),
                               True, f_args_variable, f_args_tensor)
 
     def test_block_diag(self):
-        f_args_variable = (torch.randn(1, S, requires_grad=True),
-                           torch.randn(2, S, requires_grad=True),
-                           torch.randn(3, S, requires_grad=True))
+        f_args_variable = (torch.randn(1, S, dtype=torch.double, requires_grad=True),
+                           torch.randn(2, S, dtype=torch.double, requires_grad=True),
+                           torch.randn(3, S, dtype=torch.double, requires_grad=True))
         f_args_tensor = deepcopy(unpack_variables(f_args_variable))
         run_functional_checks(self, "test_block_diag", "block_diag",
                               lambda a, b, c: torch.block_diag(a, b, c),
                               True, f_args_variable, f_args_tensor)
 
     def test_cat(self):
-        f_args_variable = (torch.randn(1, S, S, requires_grad=True),
-                           torch.randn(2, S, S, requires_grad=True),
-                           torch.randn(3, S, S, requires_grad=True),
+        f_args_variable = (torch.randn(1, S, S, dtype=torch.double, requires_grad=True),
+                           torch.randn(2, S, S, dtype=torch.double, requires_grad=True),
+                           torch.randn(3, S, S, dtype=torch.double, requires_grad=True),
                            0)
         f_args_tensor = deepcopy(unpack_variables(f_args_variable))
         run_functional_checks(self, "test_cat", "cat",
@@ -2764,9 +2762,9 @@ class TestAutograd(TestCase):
                               True, f_args_variable, f_args_tensor)
 
     def test_cat_negdim_1(self):
-        f_args_variable = (torch.randn(S, S, 1, requires_grad=True),
-                           torch.randn(S, S, 2, requires_grad=True),
-                           torch.randn(S, S, 3, requires_grad=True),
+        f_args_variable = (torch.randn(S, S, 1, dtype=torch.double, requires_grad=True),
+                           torch.randn(S, S, 2, dtype=torch.double, requires_grad=True),
+                           torch.randn(S, S, 3, dtype=torch.double, requires_grad=True),
                            -1)
         f_args_tensor = deepcopy(unpack_variables(f_args_variable))
         run_functional_checks(self, "test_cat_negdim_1", "cat",
@@ -2774,9 +2772,9 @@ class TestAutograd(TestCase):
                               True, f_args_variable, f_args_tensor)
 
     def test_cat_negdim_2(self):
-        f_args_variable = (torch.randn(S, 1, S, requires_grad=True),
-                           torch.randn(S, 2, S, requires_grad=True),
-                           torch.randn(S, 3, S, requires_grad=True),
+        f_args_variable = (torch.randn(S, 1, S, dtype=torch.double, requires_grad=True),
+                           torch.randn(S, 2, S, dtype=torch.double, requires_grad=True),
+                           torch.randn(S, 3, S, dtype=torch.double, requires_grad=True),
                            -2)
         f_args_tensor = deepcopy(unpack_variables(f_args_variable))
         run_functional_checks(self, "test_cat_negdim_2", "cat",
@@ -2784,8 +2782,8 @@ class TestAutograd(TestCase):
                               True, f_args_variable, f_args_tensor)
 
     def test_cat_empty_legacy(self):
-        f_args_variable = (torch.randn(0, requires_grad=True),
-                           torch.randn(S, S, requires_grad=True))
+        f_args_variable = (torch.randn(0, dtype=torch.double, requires_grad=True),
+                           torch.randn(S, S, dtype=torch.double, requires_grad=True))
         # gradgradcheck doesn't work, probably because legacy size tracking is wrong somewhere,
         # hence False passed below, but gradcheck checked explicitly.
         f_args_tensor = deepcopy(unpack_variables(f_args_variable))
@@ -2795,16 +2793,16 @@ class TestAutograd(TestCase):
         self.assertTrue(gradcheck(lambda a, b: torch.cat((a, b)), f_args_variable, eps=1e-6, atol=PRECISION))
 
     def test_cat_empty(self):
-        f_args_variable = (torch.randn(0, S, requires_grad=True),
-                           torch.randn(S, S, requires_grad=True))
+        f_args_variable = (torch.randn(0, S, dtype=torch.double, requires_grad=True),
+                           torch.randn(S, S, dtype=torch.double, requires_grad=True))
         f_args_tensor = deepcopy(unpack_variables(f_args_variable))
         run_functional_checks(self, "test_cat_empty", "cat",
                               lambda a, b: torch.cat((a, b)),
                               True, f_args_variable, f_args_tensor)
 
     def test_trapz(self):
-        f_args_variable = (torch.randn(2, 3, requires_grad=True),
-                           torch.tensor([[1.0, 2.0, 5.5], [2.3, 0.5, 6.2]], requires_grad=True))
+        f_args_variable = (torch.randn(2, 3, dtype=torch.double, requires_grad=True),
+                           torch.tensor([[1.0, 2.0, 5.5], [2.3, 0.5, 6.2]], dtype=torch.double, requires_grad=True))
         f_args_tensor = deepcopy(unpack_variables(f_args_variable))
         run_functional_checks(self, "test_trapz", "trapz",
                               lambda y, x: torch.trapz(y, x),
@@ -3379,9 +3377,9 @@ class TestAutograd(TestCase):
     def _test_lerp_tensor_weights(self, cast):
         def construct_inputs(*shapes):
-            start = cast(torch.randn(shapes[0])).requires_grad_()
-            end = cast(torch.randn(shapes[1])).requires_grad_()
-            weight = cast(torch.randn(shapes[2])).requires_grad_()
+            start = cast(torch.randn(shapes[0], dtype=torch.double)).requires_grad_()
+            end = cast(torch.randn(shapes[1], dtype=torch.double)).requires_grad_()
+            weight = cast(torch.randn(shapes[2], dtype=torch.double)).requires_grad_()
             return [start, end, weight]
 
         all_test_shapes = [((3, 3, 3), (3, 3, 3), (3, 3, 3)),  # no broadcasting
@@ -3945,10 +3943,10 @@ class TestAutograd(TestCase):
             def fn(sparse):
                 return torch.sparse.sum(sparse)
 
-            gradcheck(fn, torch.rand(10).to_sparse().requires_grad_(True), check_sparse_nnz=True,
+            gradcheck(fn, torch.rand(10, dtype=torch.double).to_sparse().requires_grad_(True), check_sparse_nnz=True,
                       check_batched_grad=False, fast_mode=fast_mode)
             with self.assertRaisesRegex(RuntimeError, 'gradcheck expects all tensor inputs are dense'):
-                gradcheck(fn, torch.rand(10).to_sparse().requires_grad_(True), check_sparse_nnz=False,
+                gradcheck(fn, torch.rand(10, dtype=torch.double).to_sparse().requires_grad_(True), check_sparse_nnz=False,
                           check_batched_grad=False, fast_mode=fast_mode)
         check(fast_mode=True)
         check(fast_mode=False)
@@ -3965,7 +3963,7 @@ class TestAutograd(TestCase):
                 return NonDetFunc.apply(grad_out, ctx._jitter) * (1 + torch.rand_like(grad_out) * ctx._jitter), None
 
         def check(fast_mode):
-            inp = torch.randn(5, 5, requires_grad=True)
+            inp = torch.randn(5, 5, dtype=torch.double, requires_grad=True)
             gradcheck(lambda x: NonDetFunc.apply(x, 0.0), inp, check_batched_grad=False, fast_mode=fast_mode)
             with self.assertRaisesRegex(RuntimeError, 'Backward is not reentrant'):
                 gradcheck(lambda x: NonDetFunc.apply(x, 1e-6), inp, check_batched_grad=False, fast_mode=fast_mode)
@@ -4040,7 +4038,7 @@ class TestAutograd(TestCase):
     def test_gradcheck_check_batched_grad(self):
         def check(fast_mode):
-            x = torch.rand(10, requires_grad=True).to_sparse()
+            x = torch.rand(10, dtype=torch.double, requires_grad=True).to_sparse()
             # runtime error while compute batched grad (print big error)
             with self.assertRaisesRegex(RuntimeError, 'gradcheck or gradgradcheck failed while testing batched gradient'):
                 gradcheck(lambda x: x.to_dense(), (x,), check_sparse_nnz=True, check_batched_grad=True, fast_mode=fast_mode)
@@ -4060,7 +4058,7 @@ class TestAutograd(TestCase):
                 y = x.clone()
                 y.register_hook(hook)
                 return y.to_dense()
-            x = torch.ones((2, 2), requires_grad=True).to_sparse()
+            x = torch.ones((2, 2), dtype=torch.double, requires_grad=True).to_sparse()
             with self.assertRaisesRegex(RuntimeError, 'grad is sparse tensor, but has incorrect sparse_dim'):
                 gradcheck(fn, (x,), atol=1e-1, check_sparse_nnz=True, check_batched_grad=False, fast_mode=fast_mode)
             self.assertFalse(gradcheck(fn, (x,), atol=1e-1, check_sparse_nnz=True, check_batched_grad=False,
@@ -4071,7 +4069,7 @@ class TestAutograd(TestCase):
                 y = x.clone()
                 y.register_hook(lambda x: x + 1e-2)
                 return y
-            x = torch.ones(1, requires_grad=True)
+            x = torch.ones(1, dtype=torch.double, requires_grad=True)
             with self.assertRaisesRegex(RuntimeError, 'backward not multiplied by grad_output'):
                 gradcheck(fn2, (x,), atol=1e-1, fast_mode=fast_mode)
             self.assertFalse(gradcheck(fn2, (x,), atol=1e-1, raise_exception=False, fast_mode=fast_mode))
@@ -4081,7 +4079,7 @@ class TestAutograd(TestCase):
                 y = x.clone().to_dense()
                 y.register_hook(lambda x: x + 1e-2)
                 return y
-            x = torch.ones(1, requires_grad=True).to_sparse()
+            x = torch.ones(1, dtype=torch.double, requires_grad=True).to_sparse()
            with self.assertRaisesRegex(RuntimeError, 'backward not multiplied by grad_output'):
                 gradcheck(fn3, (x,), atol=1e-1, check_sparse_nnz=True, check_batched_grad=False, fast_mode=fast_mode)
             self.assertFalse(gradcheck(fn3, (x,), atol=1e-1, check_sparse_nnz=True, check_batched_grad=False,
@@ -4096,7 +4094,7 @@ class TestAutograd(TestCase):
                 @staticmethod
                 def backward(ctx, x):
                     return x.to_sparse()
-            x = torch.ones(1, requires_grad=True)
+            x = torch.ones(1, dtype=torch.double, requires_grad=True)
             with self.assertRaisesRegex(RuntimeError, 'grad is incorrect layout'):
                 gradcheck(Test.apply, (x,), check_batched_grad=False, fast_mode=fast_mode)
             self.assertFalse(gradcheck(Test.apply, (x,), check_batched_grad=False, raise_exception=False,
                                        fast_mode=fast_mode))
@@ -4113,7 +4111,7 @@ class TestAutograd(TestCase):
                 y = x.clone()
                 y.register_hook(hook)
                 return y
-            x = torch.ones(1, requires_grad=True)
+            x = torch.ones(1, dtype=torch.double, requires_grad=True)
             with self.assertWarnsRegex(UserWarning, "Backwards compatibility: New undefined gradient support checking feature"):
                 with self.assertRaisesRegex(RuntimeError, 'Expected backward function to handle undefined output grads'):
                     gradcheck(fn, (x,), fast_mode=fast_mode)
@@ -4160,8 +4158,8 @@ class TestAutograd(TestCase):
        def check(fast_mode):
            def fn(x, y):
                return x * y.coalesce().to_dense()
-            a = torch.rand(2, 2, requires_grad=True)
-            b = torch.rand(2, 2).to_sparse().requires_grad_(True)
+            a = torch.rand(2, 2, dtype=torch.double, requires_grad=True)
+            b = torch.rand(2, 2, dtype=torch.double,).to_sparse().requires_grad_(True)
            self.assertTrue(gradcheck(fn, (a, b), check_sparse_nnz=True, check_batched_grad=False, fast_mode=fast_mode))
        check(fast_mode=True)
        check(fast_mode=False)
@@ -4189,7 +4187,7 @@ class TestAutograd(TestCase):
                 return torch.cat([x, x])
             else:
                 return x
-        a = torch.ones(1, requires_grad=True)
+        a = torch.ones(1, dtype=torch.double, requires_grad=True)
         with self.assertRaisesRegex(AssertionError, 'return outputs with the same shape when inputs are perturbed'):
             self.assertTrue(gradcheck(fn, (a,), fast_mode=fast_mode))
@@ -4231,12 +4229,12 @@ class TestAutograd(TestCase):
         with self.assertWarnsRegex(UserWarning, "get_numerical_jacobian was part of PyTorch's private API"):
             jacobian = get_numerical_jacobian(fn, (a, b), target=a, eps=1e-6)
-        self.assertEqual(jacobian[0], 2 * torch.eye(4))
+        self.assertEqual(jacobian[0], 2 * torch.eye(4, dtype=torch.double))
 
         with self.assertWarnsRegex(UserWarning, "get_numerical_jacobian was part of PyTorch's private API"):
-            jacobian = get_numerical_jacobian(fn, (a, b), eps=1e-6, grad_out=1)
-        self.assertEqual(jacobian[0], 2 * torch.eye(4))
-        self.assertEqual(jacobian[1], 1 * torch.eye(4))
+            jacobian = get_numerical_jacobian(fn, (a, b), eps=1e-6)
+        self.assertEqual(jacobian[0], 2 * torch.eye(4, dtype=torch.double))
+        self.assertEqual(jacobian[1], 1 * torch.eye(4, dtype=torch.double))
 
         with self.assertRaisesRegex(ValueError, "Expected grad_out to be 1.0"):
             jacobian = get_numerical_jacobian(fn, (a, b), eps=1e-6, grad_out=2.0)
@@ -4253,8 +4251,8 @@ class TestAutograd(TestCase):
         outputs = fn(a, b)
         with self.assertWarnsRegex(UserWarning, "get_analytical_jacobian was part of PyTorch's private API"):
             jacobians, reentrant, correct_grad_sizes, correct_grad_types = get_analytical_jacobian((a, b), outputs[0])
-        self.assertEqual(jacobians[0], 2 * torch.eye(4))
-        self.assertEqual(jacobians[1], 1 * torch.eye(4))
+        self.assertEqual(jacobians[0], 2 * torch.eye(4, dtype=torch.double))
+        self.assertEqual(jacobians[1], 1 * torch.eye(4, dtype=torch.double))
         self.assertTrue(reentrant)
 
         class NonDetFunc(Function):
@@ -5157,7 +5155,7 @@ for shape in [(1,), ()]:
         self.assertEqual(b.grad, torch.tensor([-inf, 0., 0.]))
 
     def test_nansum_with_nans(self):
-        a = torch.randn(2, 2, 2, 2)
+        a = torch.randn(2, 2, 2, 2, dtype=torch.double)
         with torch.no_grad():
             a[a < 0.2] = float('nan')
         a.requires_grad = True
@@ -5198,7 +5196,7 @@ for shape in [(1,), ()]:
         test(inp, torch.double, torch.float)
 
     def test_nan_to_num(self):
-        a = torch.randn(3, 3, 3, 3)
+        a = torch.randn(3, 3, 3, 3, dtype=torch.double)
         with torch.no_grad():
             a[torch.rand_like(a) < 0.2] = float('nan')
             a[torch.rand_like(a) < 0.2] = float('inf')
@@ -5891,8 +5889,8 @@ class TestAutogradFunctional(TestCase):
     def test_vjp_create_graph(self):
         def reducer(x):
             return x.sum(dim=1)
-        inputs = torch.rand(2, 2)
-        v = torch.ones(2)
+        inputs = torch.rand(2, 2, dtype=torch.double)
+        v = torch.ones(2, dtype=torch.double)
         inputs.requires_grad_()
         v.requires_grad_()
@@ -5907,8 +5905,10 @@ class TestAutogradFunctional(TestCase):
         def adder(x, y):
             return 2 * x + 3 * y, x * y
 
-        inputs = (torch.rand(2, requires_grad=True), torch.rand(2, requires_grad=True))
-        v = (torch.tensor([1., 0.], requires_grad=True), torch.tensor([1., 0.], requires_grad=True))
+        inputs = (torch.rand(2, dtype=torch.double, requires_grad=True),
+                  torch.rand(2, dtype=torch.double, requires_grad=True))
+        v = (torch.tensor([1., 0.], dtype=torch.double, requires_grad=True),
+             torch.tensor([1., 0.], dtype=torch.double, requires_grad=True))
 
         gradcheck(lambda *args: autogradF.vjp(adder, args[:2], args[2:], create_graph=True)[1], inputs + v)
         gradgradcheck(lambda *args: autogradF.vjp(adder, args[:2], args[2:], create_graph=True)[1], inputs + v)
@@ -6061,8 +6061,8 @@ class TestAutogradFunctional(TestCase):
     def test_jvp_create_graph(self):
         def reducer(x):
             return x.sum(dim=1)
-        inputs = torch.rand(2, 2)
-        v = torch.ones(2, 2)
+        inputs = torch.rand(2, 2, dtype=torch.double)
+        v = torch.ones(2, 2, dtype=torch.double)
         inputs.requires_grad_()
         v.requires_grad_()
@@ -6077,8 +6077,10 @@ class TestAutogradFunctional(TestCase):
         def adder(x, y):
             return 2 * x + 3 * y, x * y
 
-        inputs = (torch.rand(2, requires_grad=True), torch.rand(2, requires_grad=True))
-        v = (torch.tensor([1., 0.], requires_grad=True), torch.tensor([1., 0.], requires_grad=True))
+        inputs = (torch.rand(2, dtype=torch.double, requires_grad=True),
+                  torch.rand(2, dtype=torch.double, requires_grad=True))
+        v = (torch.tensor([1., 0.], dtype=torch.double, requires_grad=True),
+             torch.tensor([1., 0.], dtype=torch.double, requires_grad=True))
 
         gradcheck(lambda *args: autogradF.jvp(adder, args[:2], args[2:], create_graph=True)[1], inputs + v)
         gradgradcheck(lambda *args: autogradF.jvp(adder, args[:2], args[2:], create_graph=True)[1], inputs + v)
@@ -6293,7 +6295,7 @@ class TestAutogradFunctional(TestCase):
         def exp_reducer(x):
             return x.exp().sum(dim=1)
 
-        inputs = torch.rand(4, 4, requires_grad=True)
+        inputs = torch.rand(4, 4, dtype=torch.double, requires_grad=True)
         res = autogradF.jacobian(exp_reducer, inputs, create_graph=True, vectorize=vectorize)
         self._assert_interleaved_struct(res, exp_reducer(inputs), inputs)
         self.assertIsNotNone(res.grad_fn)
@@ -6304,7 +6306,8 @@ class TestAutogradFunctional(TestCase):
         def add_exp_reducer(x, y):
             return (x + y).exp().sum(dim=1)
 
-        inputs = (torch.rand(4, 4, requires_grad=True), torch.rand(4, 4, requires_grad=True))
+        inputs = (torch.rand(4, 4, dtype=torch.double, requires_grad=True),
+                  torch.rand(4, 4, dtype=torch.double, requires_grad=True))
         res = autogradF.jacobian(add_exp_reducer, inputs, create_graph=True, vectorize=vectorize)
         self._assert_interleaved_struct(res, add_exp_reducer(*inputs), inputs)
         self.assertIsNotNone(res[0].grad_fn)
@@ -6603,7 +6606,7 @@ class TestAutogradFunctional(TestCase):
         def pow_reducer(x):
             return x.pow(3).sum()
 
-        inputs = torch.rand(2, 2, requires_grad=True)
+        inputs = torch.rand(2, 2, dtype=torch.double, requires_grad=True)
         res = autogradF.hessian(pow_reducer, inputs, create_graph=True, vectorize=vectorize)
         self._assert_interleaved_struct(res, inputs, inputs)
         self.assertIsNotNone(res.grad_fn)
@@ -6614,7 +6617,8 @@ class TestAutogradFunctional(TestCase):
         def add_pow_reducer(x, y):
             return (x + y).pow(3).sum()
 
-        inputs = (torch.rand(2, 2, requires_grad=True), torch.rand(2, 2, requires_grad=True))
+        inputs = (torch.rand(2, 2, dtype=torch.double, requires_grad=True),
+                  torch.rand(2, 2, dtype=torch.double, requires_grad=True))
         res = autogradF.hessian(add_pow_reducer, inputs, create_graph=True, vectorize=vectorize)
         self._assert_interleaved_struct(res, inputs, inputs)
         self.assertIsNotNone(res[0][0].grad_fn)
@@ -6783,8 +6787,8 @@ class TestAutogradFunctional(TestCase):
         def foo(a):
             return 3 * a.narrow(0, 0, 3).exp().sum()
 
-        inputs = torch.rand(4, 4, requires_grad=True)
-        v = torch.ones(4, 4, requires_grad=True)
+        inputs = torch.rand(4, 4, dtype=torch.double, requires_grad=True)
+        v = torch.ones(4, 4, dtype=torch.double, requires_grad=True)
         res = autogradF.vhp(foo, inputs, v, create_graph=True)
         self._assert_same_struct(res[1], inputs)
         self.assertIsNotNone(res[0].grad_fn)
@@ -6796,8 +6800,10 @@ class TestAutogradFunctional(TestCase):
         def bar(a, b):
             return (a + 3 * b.narrow(0, 0, 3)).exp().sum()
 
-        inputs = (torch.rand(3, requires_grad=True), torch.rand(4, requires_grad=True))
-        v = (torch.ones(3, requires_grad=True), torch.ones(4, requires_grad=True))
+        inputs = (torch.rand(3, dtype=torch.double, requires_grad=True),
+                  torch.rand(4, dtype=torch.double, requires_grad=True))
+        v = (torch.ones(3, dtype=torch.double, requires_grad=True),
+             torch.ones(4, dtype=torch.double, requires_grad=True))
         out, vhp_val = autogradF.vhp(bar, inputs, v, create_graph=True)
         self._assert_same_struct(vhp_val, inputs)
         self.assertIsNotNone(out.grad_fn)
@@ -6958,8 +6964,8 @@ class TestAutogradFunctional(TestCase):
         def foo(a):
             return 3 * a.narrow(0, 0, 3).exp().sum()
 
-        inputs = torch.rand(4, 4, requires_grad=True)
-        v = torch.ones(4, 4, requires_grad=True)
+        inputs = torch.rand(4, 4, dtype=torch.double, requires_grad=True)
+        v = torch.ones(4, 4, dtype=torch.double, requires_grad=True)
         res = autogradF.hvp(foo, inputs, v, create_graph=True)
         self._assert_same_struct(res[1], inputs)
         self.assertIsNotNone(res[0].grad_fn)
@@ -6971,8 +6977,10 @@ class TestAutogradFunctional(TestCase):
         def bar(a, b):
             return (a + 3 * b.narrow(0, 0, 3)).exp().sum()
 
-        inputs = (torch.rand(3, requires_grad=True), torch.rand(4, requires_grad=True))
-        v = (torch.ones(3, requires_grad=True), torch.ones(4, requires_grad=True))
+        inputs = (torch.rand(3, dtype=torch.double, requires_grad=True),
+                  torch.rand(4, dtype=torch.double, requires_grad=True))
+        v = (torch.ones(3, dtype=torch.double, requires_grad=True),
+             torch.ones(4, dtype=torch.double, requires_grad=True))
         out, hvp_val = autogradF.hvp(bar, inputs, v, create_graph=True)
         self._assert_same_struct(hvp_val, inputs)
         self.assertIsNotNone(out.grad_fn)
@@ -7216,8 +7224,8 @@ class TestAutogradForwardMode(TestCase):
             self.assertEqual(t, bar * 2)
 
     def test_view_inplace_non_differentiable_views(self):
-        original_foo = torch.rand(2)
-        original_bar = torch.ones(2)
+        original_foo = torch.rand(2, dtype=torch.double)
+        original_bar = torch.ones(2, dtype=torch.double)
 
         # Do clones to be able to compare the values updated inplace
         # with the original content of these Tensors
@@ -7380,7 +7388,7 @@ class TestAutogradDeviceType(TestCase):
             assert torch.isfinite(x.grad).all()
 
     def test_parameter_resize(self, device):
-        asd = torch.nn.Parameter(torch.ones(16, device=device))
+        asd = torch.nn.Parameter(torch.ones(16, dtype=torch.double, device=device))
 
         for i in range(2):
             with torch.no_grad():
@@ -7398,7 +7406,7 @@ class TestAutogradDeviceType(TestCase):
             i.mul_(torch.tensor(size[:sparse_dim]).unsqueeze(1).to(i))
             i = i.to(torch.long)
 
-            inp = torch.randn(v_size, requires_grad=True)
+            inp = torch.randn(v_size, dtype=torch.double, requires_grad=True)
             other = self.genSparseTensor(size, sparse_dim, nnz, is_uncoalesced=True, device=device, dtype=torch.double)[0]
@@ -7445,7 +7453,7 @@ class TestAutogradDeviceType(TestCase):
             self.assertEqual(integral_conv(l), pyscalar)
 
             # floating point -> floating point
-            f = Variable(t(torch.randn(1, 1)))
+            f = Variable(t(torch.randn(1, 1, dtype=torch.double)))
             pyscalar = -12345.1
             f[0] = pyscalar
             self.assertEqual(float(f), pyscalar)
@@ -7603,8 +7611,8 @@ class TestAutogradDeviceType(TestCase):
         output.sum().backward()
 
     def test_where_functional(self, device):
-        x = torch.randn(5, 5, device=device, requires_grad=True)
-        y = torch.randn(5, 5, device=device, requires_grad=True)
+        x = torch.randn(5, 5, dtype=torch.double, device=device, requires_grad=True)
+        y = torch.randn(5, 5, dtype=torch.double, device=device, requires_grad=True)
         cond = mask_not_all_zeros((5, 5)).to(device=device)
 
         def where(cond, x, y):
@@ -7613,13 +7621,13 @@ class TestAutogradDeviceType(TestCase):
         gradcheck(where, [cond, x, y], raise_exception=True)
         gradgradcheck(where, [cond, x, y], [torch.randn(5, 5, device=device)])
 
-        x = torch.randn(5, 1, 5, device=device, requires_grad=True)
-        y = torch.randn(5, 5, 1, device=device, requires_grad=True)
+        x = torch.randn(5, 1, 5, dtype=torch.double, device=device, requires_grad=True)
+        y = torch.randn(5, 5, 1, dtype=torch.double, device=device, requires_grad=True)
         gradcheck(where, [cond, x, y], raise_exception=True)
         gradgradcheck(where, [cond, x, y], [torch.randn(5, 5, 5, device=device)])
 
     def test_where_scalar(self, device):
-        x = torch.randn(5, 5, device=device, requires_grad=True)
+        x = torch.randn(5, 5, dtype=torch.double, device=device, requires_grad=True)
         scalar = 4.
         cond = mask_not_all_zeros((5, 5)).to(device=device)
@@ -7662,7 +7670,7 @@ class TestAutogradDeviceType(TestCase):
         for input_length, vary_lengths, zero_mode in tests:
             targets = torch.randint(1, num_labels, (batch_size, target_length), device=device, dtype=torch.long)
-            x = torch.randn(gradcheck_input_size, device=device, requires_grad=True)
+            x = torch.randn(gradcheck_input_size, dtype=torch.double, device=device, requires_grad=True)
             tile_factors = torch.randn(input_length * batch_size * num_labels // gradcheck_input_size + 1, device=device)
             input_lengths = [(torch.randint(input_length // 2, input_length + 1, ()).item()
@@ -7757,7 +7765,7 @@ class TestAutogradDeviceType(TestCase):
     @onlyCUDA
     def test_pin_memory(self, device):
-        x = torch.randn(2, 2, requires_grad=True)
+        x = torch.randn(2, 2, dtype=torch.double, requires_grad=True)
         self.assertEqual(x, x.pin_memory())
         self.assertIsNot(x, x.pin_memory())
         self.assertTrue(x.pin_memory().requires_grad)
@@ -8067,7 +8075,7 @@ class TestAutogradDeviceType(TestCase):
     def test_inplace_view_then_no_grad(self, device):
         # Perform an in-place operation on a view of a non-leaf variable.
-        a = torch.ones(3, 1, device=device, requires_grad=True)
+        a = torch.ones(3, 1, dtype=torch.double, device=device, requires_grad=True)
         b = a * 2
         c = b.view_as(b)
         c[0][0] = 3
@@ -8080,8 +8088,8 @@ class TestAutogradDeviceType(TestCase):
     def test_inplace_view_gradcheck(self, device):
         # gradcheck modifications to views
-        a = torch.randn(4, 4, device=device, requires_grad=True)
-        b = torch.randn(2, 2, device=device, requires_grad=True)
+        a = torch.randn(4, 4, dtype=torch.double, device=device, requires_grad=True)
+        b = torch.randn(2, 2, dtype=torch.double, device=device, requires_grad=True)
 
         def func(root, b):
             x = root.clone()
@@ -8090,25 +8098,25 @@ class TestAutogradDeviceType(TestCase):
             return x
 
         gradcheck(func, [a, b], raise_exception=True)
-        go = torch.randn(a.size(), device=device, requires_grad=True)
+        go = torch.randn(a.size(), dtype=torch.double, device=device, requires_grad=True)
         gradgradcheck(func, (a, b), (go,))
 
     def test_inplace_view_multiple_outputs(self, device):
-        root = torch.arange(9.).reshape(3, 3).requires_grad_()
+        root = torch.arange(9., dtype=torch.double).reshape(3, 3).requires_grad_()
         x = root.clone()
         v1 = x.unbind()
         with self.assertRaises(RuntimeError):
             v1[0].mul_(2)
 
     def test_inplace_view_of_multiple_output_view(self, device):
-        a = torch.rand(10, device=device, requires_grad=True).clone()
+        a = torch.rand(10, dtype=torch.double, device=device, requires_grad=True).clone()
         b = a.unbind(0)
         c = b[0].view_as(b[0])
         with self.assertRaises(RuntimeError):
             c.mul_(2)
 
     def test_inplace_multiple_output_view_of_view(self, device):
-        a = torch.rand(10, device=device, requires_grad=True).clone()
+        a = torch.rand(10, dtype=torch.double, device=device, requires_grad=True).clone()
         b = a.view_as(a)
         c = b.unbind(0)
         with self.assertRaises(RuntimeError):
@@ -8116,8 +8124,8 @@ class TestAutogradDeviceType(TestCase):
     def test_inplace_view_makes_base_require_grad(self, device):
         # in-place modification to view makes base require grad
-        a = torch.randn(4, 4, device=device, requires_grad=False)
-        b = torch.randn(4, 2, device=device, requires_grad=True)
+        a = torch.randn(4, 4, dtype=torch.double, device=device, requires_grad=False)
+        b = torch.randn(4, 2, dtype=torch.double, device=device, requires_grad=True)
 
         def func(root, b):
             x = root.clone()
@@ -8127,7 +8135,7 @@ class TestAutogradDeviceType(TestCase):
             return x
 
         gradcheck(func, [a, b], raise_exception=True)
-        go = torch.randn(a.size(), device=device, requires_grad=True)
+        go = torch.randn(a.size(), dtype=torch.double, device=device, requires_grad=True)
         gradgradcheck(func, (a, b), (go,))
 
     def test_inplace_view_backprop_view(self, device):
@@ -8143,10 +8151,10 @@ class TestAutogradDeviceType(TestCase):
         # Test that an in-place operation on a base that forced it to require
         # grad also forces any previous views to require grad and backprop
         # correctly
-        r = torch.ones(1, device=device, requires_grad=True)
+        r = torch.ones(1, dtype=torch.double, device=device, requires_grad=True)
 
         def fn(r):
-            x = torch.ones(5, device=device)
+            x = torch.ones(5, dtype=torch.double, device=device)
             v = x.select(0, 1)
             self.assertFalse(v.requires_grad)
             self.assertIsNone(v.grad_fn)
@@ -8159,8 +8167,8 @@ class TestAutogradDeviceType(TestCase):
     def test_inplace_view_python(self, device):
         # in-place modifications of Python-autograd created view
-        a = torch.randn(4, 4, device=device, requires_grad=True)
-        b = torch.randn(2, 2, device=device, requires_grad=True)
+        a = torch.randn(4, 4, dtype=torch.double, device=device, requires_grad=True)
+        b = torch.randn(2, 2, dtype=torch.double, device=device, requires_grad=True)
 
         class PyAdd(torch.autograd.Function):
             @staticmethod
@@ -8180,7 +8188,7 @@ class TestAutogradDeviceType(TestCase):
             return x
 
         gradcheck(func, [a, b], raise_exception=True)
-        go = torch.randn(a.size(), device=device, requires_grad=True)
+        go = torch.randn(a.size(), dtype=torch.double, device=device, requires_grad=True)
         gradgradcheck(func, (a, b), (go,))
 
     def test_inplace_view_non_contig(self, device):
@@ -8215,8 +8223,8 @@ class TestAutogradDeviceType(TestCase):
    def test_mv_grad_stride_0(self, device):
         # Reference: https://github.com/pytorch/pytorch/issues/38315
-        mat = torch.randn(2, 2, device=device)
-        vec = torch.randn(1, device=device).requires_grad_(True)
+        mat = torch.randn(2, 2, dtype=torch.double, device=device)
+        vec = torch.randn(1, dtype=torch.double, device=device).requires_grad_(True)
 
         def fn(vec):
             # Expand inside the function to make sure the input to
@@ -8229,10 +8237,10 @@ class TestAutogradDeviceType(TestCase):
 
     @onlyCUDA
     def test_gradcheck_input_output_different_device(self, device):
-        x = torch.ones((1,), device="cuda", requires_grad=True)
+        x = torch.ones((1,), dtype=torch.double, device="cuda", requires_grad=True)
         gradcheck(lambda x: x.to("cpu"), (x,))
-        x = torch.ones((1,), device="cpu", requires_grad=True)
+        x = torch.ones((1,), dtype=torch.double, device="cpu", requires_grad=True)
         gradcheck(lambda x: x.to("cuda"), (x,))
 
     def test_logcumsumexp_large_value(self, device):
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 9afcbae8746..37c4c237a13 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -5888,7 +5888,7 @@ shape_funcs = [op for op in op_db if isinstance(op, ShapeFuncInfo)]
 def index_variable(shape, max_indices, device=torch.device('cpu')):
     if not isinstance(shape, tuple):
         shape = (shape,)
-    index = torch.rand(*shape, device=device).mul_(max_indices).floor_().long()
+    index = torch.rand(*shape, dtype=torch.double, device=device).mul_(max_indices).floor_().long()
     return index
 
@@ -6216,6 +6216,10 @@ def create_input(call_args, requires_grad=True, non_contiguous=False, call_kwarg
             # double check casting
             elif isinstance(arg, non_differentiable):
                 if isinstance(arg.tensor, torch.Tensor):
+                    if arg.tensor.dtype == torch.float:
+                        return maybe_non_contig(arg.tensor.to(dtype=torch.double, device=device))
+                    if arg.tensor.dtype == torch.cfloat:
+                        return maybe_non_contig(arg.tensor.to(dtype=torch.cdouble, device=device))
                     return maybe_non_contig(arg.tensor.to(device=device))
                 return maybe_non_contig(arg.tensor.to(device=device))
             elif isinstance(arg, torch.Tensor):
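
For reference, the pattern this patch applies throughout the test files can be shown in isolation. The snippet below is an illustrative sketch only (its tensors and lambda are made up, not taken from the patch): each tensor that feeds gradcheck names its dtype at construction instead of relying on a process-wide default.

    import torch
    from torch.autograd import gradcheck

    # Before: tests were double precision only as a side effect of global state.
    #   torch.set_default_dtype(torch.double)
    #   x = torch.randn(3, 3, requires_grad=True)

    # After: the dtype is stated where the tensor is created, so the test
    # neither depends on nor leaks global configuration.
    x = torch.randn(3, 3, dtype=torch.double, requires_grad=True)

    # gradcheck compares analytical gradients against finite differences;
    # float64 inputs are what make its default tolerances attainable.
    assert gradcheck(lambda t: (t * t).sum(), (x,))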