diff --git a/aten/src/ATen/core/NamedRegistrations.cpp b/aten/src/ATen/core/NamedRegistrations.cpp
index d7884eefbd3..053c8a79fe8 100644
--- a/aten/src/ATen/core/NamedRegistrations.cpp
+++ b/aten/src/ATen/core/NamedRegistrations.cpp
@@ -11,6 +11,7 @@ TORCH_LIBRARY_IMPL(_, Named, m) {
 TORCH_LIBRARY_IMPL(aten, Named, m) {
   m.impl("_cdist_forward", CppFunction::makeFallthrough());
   m.impl("_fused_dropout", CppFunction::makeFallthrough());
+  m.impl("native_dropout", CppFunction::makeFallthrough());
   m.impl("_local_scalar_dense", CppFunction::makeFallthrough());
   m.impl("_sparse_log_softmax.Dimname", CppFunction::makeFallthrough());
   m.impl("_sparse_log_softmax.int", CppFunction::makeFallthrough());
diff --git a/aten/src/ATen/native/Dropout.cpp b/aten/src/ATen/native/Dropout.cpp
index c4a6ec6cef5..79cb7c39994 100644
--- a/aten/src/ATen/native/Dropout.cpp
+++ b/aten/src/ATen/native/Dropout.cpp
@@ -3,7 +3,8 @@
 #include
 #include
 
-namespace at { namespace native {
+namespace at {
+namespace native {
 
 namespace {
 
@@ -85,11 +86,39 @@ ALIAS_SPECIALIZATION(_feature_alpha_dropout, true, true )
 
 } // anomymous namepsace
 
+std::tuple<Tensor, Tensor>
+native_dropout_cpu(const Tensor& input, double p, c10::optional<bool> train) {
+  if (input.numel() == 0) {
+    return std::make_tuple(input, at::empty_like(input, input.options()));
+  }
+
+  Tensor mask;
+  Tensor output;
+
+  if (!train.has_value() || *train) {
+    double p1m = 1. - p;
+    // Check for probability of zero to avoid divide by zero and NaN results
+    double scale = p1m == 0 ? 0. : 1. / p1m;
+    mask = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
+    mask.bernoulli_(p1m);
+    output = input.mul(mask).mul_(scale);
+  } else {
+    mask = at::ones_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
+    output = input.clone();
+  }
+  return std::make_tuple(output, mask);
+}
+
+Tensor native_dropout_backward_cpu(const Tensor& grad, const Tensor& mask, double scale) {
+  Tensor result = grad * mask * scale;
+  return result;
+}
+
 Tensor dropout(const Tensor& input, double p, bool train) {
   auto result = [&]() {
     NoNamesGuard guard;
     if (train && is_fused_kernel_acceptable(input, p)) {
-      return std::get<0>(at::_fused_dropout(input, 1 - p));
+      return std::get<0>(at::native_dropout(input, p, train));
     }
     return _dropout(input, p, train);
   }();
@@ -125,4 +154,5 @@ Tensor& feature_alpha_dropout_(Tensor& input, double p, bool train) {
   return _feature_alpha_dropout(input, p, train);
 }
 
-}} // namespace at::native
+} // namespace native
+} // namespace at
diff --git a/aten/src/ATen/native/cuda/Dropout.cu b/aten/src/ATen/native/cuda/Dropout.cu
index a4b818eb1c4..2d26177ecd6 100644
--- a/aten/src/ATen/native/cuda/Dropout.cu
+++ b/aten/src/ATen/native/cuda/Dropout.cu
@@ -26,22 +26,22 @@ template <
     typename accscalar_t,
     typename IndexType,
     int ADims,
-    int VEC>
+    int VEC,
+    typename mask_t>
 #if __CUDA_ARCH__ >= 350 || defined(USE_ROCM)
 C10_LAUNCH_BOUNDS_2(256, 4)
 #endif
-__global__ void fused_dropout_kernel_vec(
-    at::cuda::detail::TensorInfo<scalar_t, IndexType> a,
-    at::cuda::detail::TensorInfo<scalar_t, IndexType> b,
-    at::cuda::detail::TensorInfo<uint8_t, IndexType> c,
-    IndexType totalElements,
-    accscalar_t p,
-    PhiloxCudaState philox_args) {
+__global__ void
+fused_dropout_kernel_vec(at::cuda::detail::TensorInfo<scalar_t, IndexType> a,
+                         at::cuda::detail::TensorInfo<scalar_t, IndexType> b,
+                         at::cuda::detail::TensorInfo<mask_t, IndexType> c,
+                         IndexType totalElements, accscalar_t p,
+                         PhiloxCudaState philox_args) {
   // make sure we don't break assumption that we can't have > 4 elements / thread
   static_assert(VEC <= 4, "Value of VEC must be in [2, 4]");
 
   using LoadT = memory::aligned_vector<scalar_t, VEC>;
-  using MaskLoadT = memory::aligned_vector<uint8_t, VEC>;
+  using MaskLoadT = memory::aligned_vector<mask_t, VEC>;
 
   auto seeds = at::cuda::philox::unpack(philox_args);
   IndexType idx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -51,11 +51,10 @@ __global__ void fused_dropout_kernel_vec(
               std::get<1>(seeds),
               &state);
 
-  accscalar_t pinv = accscalar_t(1)/p;
-
   // Helps align the total number of times curand_uniform4 is called by each thread for the same totalElements
   // in the vec=2 and vec=4 cases.
   bool gridxvec_loop_state = 0;
+  accscalar_t scale = 1.0 / p;
 
   float4 rand;
@@ -94,13 +93,13 @@ __global__ void fused_dropout_kernel_vec(
       *value = *reinterpret_cast<LoadT*>(&a.data[linearIndex]);
 
       scalar_t r[VEC];
-      uint8_t mask[VEC];
+      mask_t mask[VEC];
 
       // Perform the actual computation
 #pragma unroll
       for (int ii = 0; ii < VEC; ii++) {
-        r[ii] = src[ii]*(&rand.x)[ii]*pinv;
-        mask[ii] = (uint8_t)(&rand.x)[ii];
+        r[ii] = src[ii]*(&rand.x)[ii]*scale;
+        mask[ii] = (mask_t)(&rand.x)[ii];
       }
       // Vectorized writes for both mask & result
       *(reinterpret_cast<LoadT*>(&b.data[linearIndex])) = *reinterpret_cast<LoadT*>(&r[0]);
@@ -115,17 +114,17 @@ template <
     typename accscalar_t,
     typename IndexType,
     int ADims,
-    int BDims = ADims>
+    int BDims = ADims,
+    typename mask_t>
 #if __CUDA_ARCH__ >= 350 || defined(USE_ROCM)
 C10_LAUNCH_BOUNDS_2(256, 4)
 #endif
-__global__ void fused_dropout_kernel(
-    cuda::detail::TensorInfo<scalar_t, IndexType> a,
-    cuda::detail::TensorInfo<scalar_t, IndexType> b,
-    cuda::detail::TensorInfo<uint8_t, IndexType> c,
-    IndexType totalElements,
-    accscalar_t p,
-    PhiloxCudaState philox_args) {
+__global__ void
+fused_dropout_kernel(cuda::detail::TensorInfo<scalar_t, IndexType> a,
+                     cuda::detail::TensorInfo<scalar_t, IndexType> b,
+                     cuda::detail::TensorInfo<mask_t, IndexType> c,
+                     IndexType totalElements, accscalar_t p,
+                     PhiloxCudaState philox_args) {
   auto seeds = at::cuda::philox::unpack(philox_args);
   IndexType idx = blockIdx.x * blockDim.x + threadIdx.x;
   curandStatePhilox4_32_10_t state;
@@ -133,8 +132,7 @@ __global__ void fused_dropout_kernel(
               idx,
               std::get<1>(seeds),
              &state);
-
-  accscalar_t pinv = accscalar_t(1)/p;
+  accscalar_t scale = 1.0 / p;
 
  IndexType rounded_size = ((totalElements - 1)/(blockDim.x * gridDim.x * UNROLL)+1) *
      blockDim.x * gridDim.x * UNROLL;
@@ -163,15 +161,15 @@ __global__ void fused_dropout_kernel(
        // Convert `linearIndex` into an offset of `b`
        const IndexType bOffset =
            cuda::detail::IndexToOffset<scalar_t, IndexType, 1>::get(li, b);
-        b.data[bOffset] = src[ii]*(&rand.x)[ii]*pinv;
-        c.data[bOffset] = (uint8_t)(&rand.x)[ii];
+        b.data[bOffset] = src[ii]*(&rand.x)[ii]*scale;
+        c.data[bOffset] = (mask_t)(&rand.x)[ii];
      }
    }
    __syncthreads();
  }
 }
 
-template<typename scalar_t, typename accscalar_t>
+template<typename mask_t, typename scalar_t, typename accscalar_t>
 void masked_scale_kernel(at::Tensor& ret, const at::Tensor& src, const at::Tensor& mask, accscalar_t scale){
   auto iter = at::TensorIteratorConfig()
       .check_all_same_dtype(false)
@@ -182,7 +180,7 @@ void masked_scale_kernel(at::Tensor& ret, const at::Tensor& src, const at::Tenso
 
   at::native::gpu_kernel(
       iter,
-      [=]GPU_LAMBDA(const scalar_t src_val, const uint8_t mask_val) -> scalar_t {
+      [=]GPU_LAMBDA(const scalar_t src_val, const mask_t mask_val) -> scalar_t {
         return (float)mask_val * src_val * scale;
      });
 }
@@ -206,7 +204,7 @@ int get_vector_size(at::Tensor self, at::Tensor ret, at::Tensor mask) {
   return can_vectorize ? vec_size : 1;
 }
 
-template <typename index_type>
+template <typename mask_t, typename index_type>
 inline void launcher(
     const Tensor& self,
     Tensor& ret,
@@ -229,7 +227,7 @@ inline void launcher(
     auto ret_info =
         cuda::detail::getTensorInfo<scalar_t, index_type>(ret);
     auto mask_info =
-        cuda::detail::getTensorInfo<uint8_t, index_type>(mask);
+        cuda::detail::getTensorInfo<mask_t, index_type>(mask);
     self_info.collapseDims();
     ret_info.collapseDims();
     mask_info.collapseDims(); // ret and mask are collapsed to 1d
@@ -321,14 +319,16 @@ inline void launcher(
 
 } //anonymous namespace
 
+template <typename mask_t>
 std::tuple<Tensor,Tensor>
-fused_dropout_cuda(const Tensor& self, double p, c10::optional<Generator> gen_){
-  auto gen = get_generator_or_default<CUDAGeneratorImpl>(gen_, cuda::detail::getDefaultCUDAGenerator());
-  Tensor ret = at::empty_like(self);
-  Tensor mask = at::empty_like(self, self.options().dtype(kByte));
+dropout_cuda(CUDAGeneratorImpl* gen, const Tensor& self, double p){
+  Tensor mask = at::empty_like(self, self.options().dtype(c10::CppTypeToScalarType<mask_t>::value));
   const int64_t nelem = self.numel();
-//empty tensors should not get here, but just in case, avoid FPE
-  if (nelem==0) return std::tuple<Tensor,Tensor>(self, mask);
+  // empty tensors should not get here, but just in case, avoid FPE
+  // non-training short-cut
+  if (nelem==0) return std::tuple<Tensor,Tensor>(self.clone(), mask);
+
+  Tensor ret = at::empty_like(self);
   const int64_t block_size = 256;
   unsigned int blocks_per_sm = at::cuda::getCurrentDeviceProperties()->maxThreadsPerMultiProcessor/block_size;
   dim3 dim_block(block_size);
@@ -343,25 +343,62 @@ fused_dropout_cuda(const Tensor& self, double p, c10::optional<Generator> gen_){
     rng_engine_inputs = gen->philox_cuda_state(counter_offset);
   }
   if (cuda::detail::canUse32BitIndexMath(self)){
-    launcher<unsigned int>(
+    launcher<mask_t, unsigned int>(
         self, ret, mask, p, nelem, rng_engine_inputs, grid, dim_block);
   } else {
-    launcher<uint64_t>(
+    launcher<mask_t, uint64_t>(
         self, ret, mask, p, nelem, rng_engine_inputs, grid, dim_block);
   }
   return std::tuple<Tensor,Tensor>(ret, mask);
 }
 
-Tensor masked_scale_cuda(const Tensor& self, const Tensor& mask, double scale){
-  Tensor ret = at::empty_like(self, self.suggest_memory_format());
-  TORCH_CHECK(mask.scalar_type() == at::ScalarType::Byte, "mask should be torch.uint8 dtype");
+std::tuple<Tensor,Tensor>
+native_dropout_cuda(const Tensor& self, double p, c10::optional<bool> train){
+  // short-cut for train == false
+  if (train.has_value() && !train.value()) {
+    return std::make_tuple(self.clone(), at::ones_like(self, self.options().dtype(c10::CppTypeToScalarType<bool>::value)));
+  }
+  // short-cut
+  if (p == 1) {
+    // native_dropout_cuda is in derivatives.yaml, so we don't need to add data
+    // dependency from output to input for autograd
+    auto ret = at::zeros_like(self);
+    auto mask = at::zeros_like(self, self.options().dtype(c10::CppTypeToScalarType<bool>::value));
+    return std::tuple<Tensor,Tensor>(ret, mask);
+  }
+
+  auto gen = get_generator_or_default<CUDAGeneratorImpl>(c10::nullopt, cuda::detail::getDefaultCUDAGenerator());
+  double p1m = 1. - p;
+  return dropout_cuda<bool>(gen, self, p1m);
+}
+
+// TODO: _fused_dropout_cuda is to be removed, see PR #63937
+std::tuple<Tensor,Tensor>
+fused_dropout_cuda(const Tensor& self, double p, c10::optional<Generator> gen_){
+  auto gen = get_generator_or_default<CUDAGeneratorImpl>(gen_, cuda::detail::getDefaultCUDAGenerator());
+  return dropout_cuda<uint8_t>(gen, self, p);
+}
+
+template <typename mask_t>
+Tensor dropout_backward_cuda(const Tensor& grad, const Tensor& mask, double scale){
+  Tensor ret = at::empty_like(grad, grad.suggest_memory_format());
   AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, ret.scalar_type(), "masked_scale", [&] {
     using accscalar_t = acc_type<scalar_t, true>;
-    accscalar_t pa = (accscalar_t)(scale);
-    masked_scale_kernel<scalar_t>(ret, self, mask, pa);
+    masked_scale_kernel<mask_t>(ret, grad, mask, (accscalar_t)scale);
  });
  return ret;
 }
 
+Tensor native_dropout_backward_cuda(const Tensor& grad, const Tensor& mask, double scale){
+  TORCH_CHECK(mask.scalar_type() == at::ScalarType::Bool, "Mask should be Bool Scalar Type", mask.scalar_type());
+  return dropout_backward_cuda<bool>(grad, mask, scale);
+}
+
+// TODO: masked_scale_cuda is to be removed, see PR #63937
+Tensor masked_scale_cuda(const Tensor& self, const Tensor& mask, double scale){
+  TORCH_CHECK(mask.scalar_type() == at::ScalarType::Byte, "mask should be torch.uint8 dtype");
+  return dropout_backward_cuda<uint8_t>(self, mask, scale);
+}
+
 }
 }
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index e67d66a4446..f43e7780a7b 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -176,6 +176,17 @@
   dispatch:
     CUDA: masked_scale_cuda
 
+- func: native_dropout(Tensor input, float p, bool? train) -> (Tensor, Tensor)
+  variants: function
+  dispatch:
+    CPU: native_dropout_cpu
+    CUDA: native_dropout_cuda
+
+- func: native_dropout_backward(Tensor grad_output, Tensor mask, float scale) -> Tensor
+  dispatch:
+    CPU: native_dropout_backward_cpu
+    CUDA: native_dropout_backward_cuda
+
 - func: _sobol_engine_draw(Tensor quasi, int n, Tensor sobolstate, int dimension, int num_generated, ScalarType? dtype) -> (Tensor, Tensor)
 
 - func: _sobol_engine_ff_(Tensor(a!) self, int n, Tensor sobolstate, int dimension, int num_generated) -> Tensor(a!)
diff --git a/test/test_jit.py b/test/test_jit.py
index 137e8708335..180d1fffb3b 100644
--- a/test/test_jit.py
+++ b/test/test_jit.py
@@ -1650,6 +1650,30 @@ graph(%Ra, %Rb):
         m = self.createFunctionFromGraph(g)
         self.assertEqual(outputs, m(*inputs))
 
+    @unittest.skipIf(not RUN_CUDA, "test requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "skip if profiling isn't enabled")
+    def test_native_dropout_corner_case(self):
+        with disable_autodiff_subgraph_inlining():
+            def t(x, p: float, t: bool):
+                o = torch.dropout(x, p, t)
+                return o
+
+            jit_t = torch.jit.script(t)
+            x = torch.randn(5).requires_grad_()
+            FileCheck().check("prim::DifferentiableGraph").run(jit_t.graph_for(x, 1.0, True, profile_and_replay=True))
+
+            for train in [True, False]:
+                for p in [0.0, 1.0]:
+                    for device in ["cuda", "cpu"]:
+                        x = torch.randn(5).to(device=device).requires_grad_()
+                        x_ref = x.detach().requires_grad_()
+                        o = jit_t(x, p, train)
+                        o_ref = t(x_ref, p, train)
+                        o.sum().backward()
+                        o_ref.sum().backward()
+                        assert(o.equal(o_ref))
+                        assert(x.grad.equal(x_ref.grad))
+
     @slowTest
     @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.LEGACY, 'Testing differentiable graph')
     def test_dropout_module_requires_grad(self):
@@ -1690,7 +1714,7 @@ graph(%Ra, %Rb):
             for requires_grad in (True, False):
                 X = torch.randn(M, M, requires_grad=requires_grad)
                 if requires_grad:
-                    FileCheck().check("aten::bernoulli_").run(scripted.graph_for(X, profile_and_replay=True))
+                    FileCheck().check("aten::native_dropout").run(scripted.graph_for(X, profile_and_replay=True))
                 self.assertEqual(training, 'aten::bernoulli_' in profile(scripted, X))
 
     @unittest.skipIf(GRAPH_EXECUTOR == ProfilingMode.SIMPLE, 'Testing differentiable graph')
@@ -1714,7 +1738,7 @@ graph(%Ra, %Rb):
             for requires_grad in (True, False):
                 X = torch.randn(M, M, requires_grad=requires_grad)
                 if requires_grad:
-                    FileCheck().check("aten::bernoulli_").run(scripted_training.graph_for(X, profile_and_replay=True))
+                    FileCheck().check("aten::native_dropout").run(scripted_training.graph_for(X, profile_and_replay=True))
                 self.assertIn('aten::bernoulli_', profile(scripted_training, X))
                 self.assertNotIn('aten::bernoulli_', profile(scripted_eval, X))
diff --git a/test/test_nn.py b/test/test_nn.py
index 66ef408283e..ba384bf920f 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -6683,6 +6683,20 @@ class TestNN(NNTestCase):
         bad_input = torch.randn(3, 1)
         test_all(hidden_size, good_hx, good_hx, input_size, bad_input)
 
+    @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
+    def test_native_dropout_corner_case(self):
+        for train in [True, False]:
+            for p in [0.0, 1.0]:
+                for device in ["cuda", "cpu"]:
+                    x = torch.randn(5).to(device=device).requires_grad_()
+                    x_ref = x.detach().requires_grad_()
+                    o = torch.native_dropout(x, p, train)[0]
+                    o_ref = torch.dropout(x_ref, p, train)
+                    o.sum().backward()
+                    o_ref.sum().backward()
+                    assert(o.equal(o_ref))
+                    assert(x.grad.equal(x_ref.grad))
+
     def test_invalid_dropout_p(self):
         v = torch.ones(1)
         self.assertRaises(ValueError, lambda: nn.Dropout(-0.1))
diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml
index 201e7453a00..47e7259c582 100644
--- a/tools/autograd/derivatives.yaml
+++ b/tools/autograd/derivatives.yaml
@@ -534,6 +534,13 @@
 - name: _fused_dropout(Tensor self, float p, Generator? generator=None) -> (Tensor, Tensor)
   self: _fused_dropout_backward(grad, result1, p)
 
+- name: native_dropout(Tensor input, float p, bool? train) -> (Tensor, Tensor)
+  input: "GradMode::is_enabled() ? infinitely_differentiable_native_dropout_backward(grad, result1, (!train.has_value() || !train.value() ? 1 : (p == 1 ? 0.0 : 1.0 / (1.0 - p)))) : native_dropout_backward(grad, result1, (!train.has_value() || !train.value() ? 1 : (p == 1 ? 0.0 : 1.0 / (1.0 - p))))"
+
+- name: native_dropout_backward(Tensor grad_output, Tensor mask, float scale) -> Tensor
+  grad_output: "native_dropout_double_backward(grad, grad_output, mask, scale)"
+  mask: 'not_implemented("native_dropout_backward: mask")'
+
 - name: eig(Tensor self, bool eigenvectors=False) -> (Tensor eigenvalues, Tensor eigenvectors)
   self: eig_backward(grads, self, eigenvectors, eigenvalues, eigenvectors_return)
diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp
index 74eb1b75df7..37617cd2a9c 100644
--- a/torch/csrc/autograd/FunctionsManual.cpp
+++ b/torch/csrc/autograd/FunctionsManual.cpp
@@ -927,6 +927,15 @@ Tensor _fused_dropout_backward(Tensor grad, Tensor mask, double p1m) {
   }
 }
 
+// scale == (1 / (1 - prob))
+Tensor infinitely_differentiable_native_dropout_backward(const Tensor& grad, const Tensor& mask, double scale) {
+  return grad * (mask.type_as(grad) * scale);
+}
+
+Tensor native_dropout_double_backward(const Tensor& ggI, const Tensor& grad, const Tensor& mask, double scale) {
+  return ggI.type_as(grad) * (mask.type_as(grad) * scale);
+}
+
 Tensor evenly_distribute_backward(Tensor grad, const Tensor & input, const Tensor & value) {
   if (input.is_cuda()) {
     auto mask = (input == value).logical_or_(input.isnan().logical_and_(value.isnan()));
diff --git a/torch/csrc/autograd/FunctionsManual.h b/torch/csrc/autograd/FunctionsManual.h
index bfb87be3e91..445d88050c4 100644
--- a/torch/csrc/autograd/FunctionsManual.h
+++ b/torch/csrc/autograd/FunctionsManual.h
@@ -92,6 +92,8 @@ at::Tensor sparse_sparse_matmul_backward(const at::Tensor& grad, const at::Tenso
 at::Tensor renorm_backward(const at::Tensor & grad, const at::Tensor & self, const at::Scalar& p, int64_t dim, const at::Scalar& maxnorm);
 at::Tensor repeat_backward(at::Tensor grad, at::IntArrayRef repeats, at::IntArrayRef input_shape);
 at::Tensor _fused_dropout_backward(at::Tensor grad, at::Tensor mask, double p1m);
+at::Tensor infinitely_differentiable_native_dropout_backward(const at::Tensor& grad, const at::Tensor& mask, double scale);
+at::Tensor native_dropout_double_backward(const at::Tensor& ggI, const at::Tensor& grad, const at::Tensor& mask, double scale);
 at::Tensor evenly_distribute_backward(at::Tensor grad, const at::Tensor & input, const at::Tensor & value);
 at::Tensor sgn_backward(Tensor result, Tensor grad, Tensor self);
 at::Tensor var_backward(at::Tensor grad, const at::Tensor& self, c10::optional<IntArrayRef> dim, c10::optional<int64_t> correction, bool keepdim);
diff --git a/torch/csrc/jit/ir/ir.cpp b/torch/csrc/jit/ir/ir.cpp
index cf3bcb8aea4..2448aa68d95 100644
--- a/torch/csrc/jit/ir/ir.cpp
+++ b/torch/csrc/jit/ir/ir.cpp
@@ -1142,6 +1142,7 @@ bool Node::isNondeterministic() const {
       "aten::bernoulli(Tensor self, *, Generator? generator) -> Tensor",
       "aten::bernoulli(Tensor self, float p, *, Generator? generator) -> Tensor",
       "aten::multinomial(Tensor self, int num_samples, bool replacement, *, Generator? generator) -> Tensor",
+      "aten::native_dropout(Tensor input, float p, bool? train) -> (Tensor, Tensor)",
      "aten::normal(Tensor mean, Tensor std, *, Generator? generator) -> Tensor",
      "aten::normal(float mean, Tensor std, *, Generator? generator) -> Tensor",
      "aten::normal(Tensor mean, float std, *, Generator? generator) -> Tensor",
diff --git a/torch/csrc/jit/runtime/symbolic_script.cpp b/torch/csrc/jit/runtime/symbolic_script.cpp
index 23d253e43d5..2ec7fb0046b 100644
--- a/torch/csrc/jit/runtime/symbolic_script.cpp
+++ b/torch/csrc/jit/runtime/symbolic_script.cpp
@@ -1174,40 +1174,16 @@ const std::vector<std::string> functions = {
                 return grad_input, None, grad_weight, grad_bias, None, None
             return output, backward
 
-        def AD_fused_dropout_backward(grad,
-                                      mask,
-                                      p1m: float):
-            p1r = 1. / p1m
-            grad_input = grad * (mask.type_as(grad) * p1r)
-            return grad_input
-
         def dropout(input,
                     p: float,
                     train: bool):
-            use_cuda = input.is_cuda
-            # lowering is specialized for cuda because cuda fuser can efficiently fuse those operations
-            # for cpu backend, where fusions are disabled, a different lowering that is more efficient
-            # in the absence of fusion is used
-            p1m = 1. - p
-            if train:
-                if use_cuda:
-                    mask = torch.rand_like(input, memory_format=1) < p1m
-                    res = mask.type_as(input) * input * (1./p1m)
-                else:
-                    mask = torch.empty_like(input, memory_format=1)
-                    mask.bernoulli_(p1m)
-                    res = mask * input / p1m
-            else:
-                p1m = 1.
-                res = input
-                mask = torch.empty_like(input, memory_format=1)
+            # if `train == false` we need to set `p1m` to 0 so `scale == 1`
+            p1m = (1. - p) * float(train)
+            scale = 1. / (float(p1m == 0.) + p1m)
+            res, mask = torch.native_dropout(input, p, train)
 
             def backward(grad_output):
-                use_cuda = grad_output.is_cuda
-                if use_cuda:
-                    grad_input = AD_fused_dropout_backward(grad_output, mask, p1m)
-                else:
-                    grad_input = grad_output * mask / p1m
+                grad_input = torch.native_dropout_backward(grad_output, mask, scale)
                 return grad_input, None, None
             return res, backward
diff --git a/torch/overrides.py b/torch/overrides.py
index f373d65fa6a..86617b71892 100644
--- a/torch/overrides.py
+++ b/torch/overrides.py
@@ -664,6 +664,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]:
         torch.narrow_copy: lambda input, dim, start, length: -1,
         torch.nan_to_num: lambda input, nan=0.0, posinf=None, neginf=None, out=None: -1,
         torch.native_batch_norm: lambda input, weight, bias, running_mean, running_var, training, momentum, eps: -1,
+        torch.native_dropout: lambda input, p, train: -1,
         torch.native_layer_norm: lambda input, normalized_shape, weight=None, bias=None, eps=1e-05: -1,
         torch.native_group_norm: lambda input, weight, bias, N, C, HxW, group, eps: -1,
         torch.native_norm: lambda input, p=2: -1,
diff --git a/torch/testing/_internal/jit_metaprogramming_utils.py b/torch/testing/_internal/jit_metaprogramming_utils.py
index 36bfeb1103a..b94a6972f3f 100644
--- a/torch/testing/_internal/jit_metaprogramming_utils.py
+++ b/torch/testing/_internal/jit_metaprogramming_utils.py
@@ -68,9 +68,7 @@ nn_functional_tests = [
    ('adaptive_avg_pool1d', (S, S, S), (5,), '', (True,)),
    ('adaptive_avg_pool2d', (S, S, S, S), ([5, 7],), '', (True,)),
    ('adaptive_avg_pool3d', (S, S, S, S, S), ([3, 2, 2],), '', (True,)),
-    ('dropout', (S, S, S), (0.5,), '', (True,
-                                        ['aten::bernoulli_',
-                                         'aten::empty_like', 'aten::mul', 'aten::div'])),
+    ('dropout', (S, S, S), (0.5,), '', (True, 'aten::native_dropout')),
    ('alpha_dropout', (S, S, S), (0.5,)),
    ('dropout2d', (S, S, S), (0.5,)),
    ('dropout3d', (S, S, S), (0.5,)),