# pytorch/BUILD.bazel

load("@bazel_skylib//lib:paths.bzl", "paths")
load("@pybind11_bazel//:build_defs.bzl", "pybind_extension")
load("@rules_proto//proto:defs.bzl", "proto_library")
load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library", "cc_proto_library", "cc_test")
load("//third_party:substitution.bzl", "header_template_rule")
load("//:tools/build_variables.bzl", "torch_cpp_srcs", "libtorch_python_core_sources", "libtorch_core_sources", "libtorch_distributed_sources", "libtorch_extra_sources", "jit_core_sources")
load("//tools/rules:cu.bzl", "cu_library")
load("//tools/config:defs.bzl", "if_cuda")
load("//:aten.bzl", "intern_build_aten_ops")

COMMON_COPTS = [
    "-DHAVE_MALLOC_USABLE_SIZE=1",
    "-DHAVE_MMAP=1",
    "-DHAVE_SHM_OPEN=1",
    "-DHAVE_SHM_UNLINK=1",
    "-D_FILE_OFFSET_BITS=64",
    "-DHAVE_GCC_GET_CPUID",
    "-DUSE_GCC_GET_CPUID",
    "-DTH_HAVE_THREAD",
    "-DUSE_FBGEMM",
    "-DUSE_DISTRIBUTED",
    "-DATEN_THREADING=NATIVE",
    "-DNO_CUDNN_DESTROY_HANDLE",
] + if_cuda([
    "-DUSE_CUDA",
    "-DUSE_CUDNN",
])
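
# `if_cuda` comes from //tools/config:defs.bzl. As a rough sketch of the usual
# Bazel pattern (the real definition lives in that .bzl file and may differ;
# the condition label below is illustrative, not the actual one):
#
#   def if_cuda(if_true, if_false = []):
#       return select({
#           "//tools/config:cuda": if_true,
#           "//conditions:default": if_false,
#       })
#
# so the CUDA-only defines above are applied only when the build is
# configured for CUDA.
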
# c10
header_template_rule(
    name = "cmake_macros_h",
    src = "c10/macros/cmake_macros.h.in",
    out = "c10/macros/cmake_macros.h",
    substitutions = {
        "cmakedefine": "define",
        "#define C10_USE_NUMA": "/* #undef C10_USE_NUMA */",
    },
)

header_template_rule(
    name = "cuda_cmake_macros_h",
    src = "c10/cuda/impl/cuda_cmake_macros.h.in",
    out = "c10/cuda/impl/cuda_cmake_macros.h",
    substitutions = {
        "cmakedefine": "define",
    },
)
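
# A note on the two template rules above: header_template_rule performs plain
# string substitution on the .h.in template. With the mappings given, a
# CMake-style line such as (illustrative; the real template contents may
# differ)
#
#   #cmakedefine C10_USE_GLOG
#
# comes out as
#
#   #define C10_USE_GLOG
#
# while C10_USE_NUMA ends up as a commented-out #undef, so NUMA support is
# deliberately disabled in the Bazel build.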

cc_library(
    name = "c10_headers",
    hdrs = glob([
        "c10/core/*.h",
        "c10/core/impl/*.h",
        "c10/cuda/*.h",
        "c10/cuda/impl/*.h",
        "c10/macros/*.h",
        "c10/mobile/*.h",
        "c10/util/*.h",
        "c10/util/*.hpp",
    ]),
    deps = [
        "@com_github_gflags_gflags//:gflags",
        "@com_github_glog//:glog",
        ":cmake_macros_h",
        ":cuda_cmake_macros_h",
    ],
)
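
# Note that the hdrs glob above always includes the c10/cuda headers, while
# in the :c10 target below the matching c10/cuda .cpp files are only compiled
# when if_cuda selects them.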

cc_library(
    name = "c10",
    srcs = glob([
        "c10/core/*.cpp",
        "c10/core/impl/*.cpp",
        "c10/mobile/*.cpp",
        "c10/util/*.cpp",
    ]) + if_cuda(
        glob([
            "c10/cuda/*.cpp",
            "c10/cuda/impl/*.cpp",
        ]),
        [],
    ),
    copts = ["-DCAFFE2_BUILD_MAIN_LIB"],
    deps = [
        ":c10_headers",
        "@fmt",
    ] + if_cuda(
        ["@cuda"],
        [],
    ),
    alwayslink = True,
)
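
# alwayslink = True on :c10 keeps every object file in the final link even if
# none of its symbols are referenced directly, which matters for code that
# registers itself via static initializers (flags, registries, and the like).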

cc_test(
    name = "c10_tests",
    size = "small",
    srcs = glob([
        "c10/test/util/*.cpp",
        "c10/test/util/*.h",
        "c10/test/core/*.cpp",
        "c10/test/core/impl/*.cpp",
    ]),
    copts = ["-Wno-deprecated-declarations"],
    deps = [
        ":c10",
        ":c10_headers",
        "@com_google_googletest//:gtest_main",
    ],
)
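
# Run with, for example: bazel test //:c10_tests
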
# TODO: refactor this into its own library (but how to make
# a binary based off of a module in a library?)
py_binary(
    name = "gen",
srcs = ["tools/setup_helpers/gen.py"],
deps = [
":tools_codegen"
],
)

genrule(
    name = "generated_cpp",
    srcs = [
        "aten/src/ATen/native/native_functions.yaml",
    ] + glob(["aten/src/ATen/templates/**"]),
    outs = [
        "aten/src/ATen/Declarations.yaml",
"aten/src/ATen/RegisterBackendSelect.cpp",
"aten/src/ATen/RegisterCPU.cpp",
"aten/src/ATen/RegisterFunctionalization_0.cpp",
"aten/src/ATen/RegisterFunctionalization_1.cpp",
"aten/src/ATen/RegisterFunctionalization_2.cpp",
"aten/src/ATen/RegisterFunctionalization_3.cpp",
# "aten/src/ATen/RegisterFunctionalizationEverything.cpp",
"aten/src/ATen/RegisterMkldnnCPU.cpp",
"aten/src/ATen/RegisterQuantizedCPU.cpp",
"aten/src/ATen/RegisterSparseCPU.cpp",
"aten/src/ATen/RegisterSparseCsrCPU.cpp",
"aten/src/ATen/RegisterZeroTensor.cpp",
"aten/src/ATen/RegisterCompositeImplicitAutograd.cpp",
"aten/src/ATen/RegisterCompositeExplicitAutograd.cpp",
"aten/src/ATen/RegisterMeta.cpp",
"aten/src/ATen/RegisterSchema.cpp",
"aten/src/ATen/CPUFunctions.h",
"aten/src/ATen/CPUFunctions_inl.h",
"aten/src/ATen/CUDAFunctions.h",
"aten/src/ATen/CUDAFunctions_inl.h",
"aten/src/ATen/CompositeExplicitAutogradFunctions.h",
"aten/src/ATen/CompositeExplicitAutogradFunctions_inl.h",
"aten/src/ATen/CompositeImplicitAutogradFunctions.h",
"aten/src/ATen/CompositeImplicitAutogradFunctions_inl.h",
"aten/src/ATen/FunctionalInverses.h",
"aten/src/ATen/Functions.h",
"aten/src/ATen/Functions.cpp",
"aten/src/ATen/RedispatchFunctions.h",
"aten/src/ATen/Operators.h",
"aten/src/ATen/Operators_0.cpp",
"aten/src/ATen/Operators_1.cpp",
"aten/src/ATen/Operators_2.cpp",
"aten/src/ATen/Operators_3.cpp",
"aten/src/ATen/Operators_4.cpp",
"aten/src/ATen/NativeFunctions.h",
"aten/src/ATen/MetaFunctions.h",
"aten/src/ATen/MetaFunctions_inl.h",
"aten/src/ATen/NativeMetaFunctions.h",
"aten/src/ATen/core/TensorBody.h",
"aten/src/ATen/core/TensorMethods.cpp",
"aten/src/ATen/core/ATenOpList.cpp",
],
cmd = "$(location :gen) --source-path aten/src/ATen --install_dir `dirname $(location aten/src/ATen/Declarations.yaml)`",
tools = [":gen"],
)
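
# In the cmd above, $(location :gen) expands to the path of the :gen binary,
# and $(location aten/src/ATen/Declarations.yaml) to the path of that declared
# output, so the backtick-quoted dirname yields the install directory for all
# of the generated files. Outside of Bazel, the roughly equivalent manual
# invocation from the source root is:
#
#   python -m tools.codegen.gen --source-path aten/src/ATen --install_dir <dir>
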
py_library(
name = "tools_codegen",
srcs = glob(["tools/codegen/**/*.py"]),
)
py_library(
name = "tools_autograd",
srcs = glob(["tools/autograd/*.py"]),
data = glob([
"tools/autograd/*.yaml",
"tools/autograd/templates/*",
]),
deps = [":tools_codegen"],
)
py_library(
name = "tools_jit",
srcs = glob(["tools/jit/*.py"]),
data = glob(["tools/jit/templates/*"]),
)
py_binary(
name = "generate_code",
srcs = ["tools/setup_helpers/generate_code.py"],
deps = [
":tools_autograd",
":tools_jit",
],
)
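# The :generate_code binary drives the autograd and JIT code generators. As a
# rough sketch, it can also be invoked by hand; the flags mirror the genrule
# command below, and <output_root> is an illustrative placeholder:
#
#   bazel run //:generate_code -- \
#       --install_dir <output_root> \
#       --native-functions-path aten/src/ATen/native/native_functions.yaml \
#       --nn-path aten/src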
libtorch_cpp_generated_sources = [
"torch/csrc/autograd/generated/VariableType.h",
"torch/csrc/autograd/generated/VariableType_0.cpp",
"torch/csrc/autograd/generated/VariableType_1.cpp",
"torch/csrc/autograd/generated/VariableType_2.cpp",
"torch/csrc/autograd/generated/VariableType_3.cpp",
"torch/csrc/autograd/generated/VariableType_4.cpp",
# "torch/csrc/autograd/generated/VariableTypeEverything.cpp",
"torch/csrc/autograd/generated/TraceType_0.cpp",
"torch/csrc/autograd/generated/TraceType_1.cpp",
"torch/csrc/autograd/generated/TraceType_2.cpp",
"torch/csrc/autograd/generated/TraceType_3.cpp",
"torch/csrc/autograd/generated/TraceType_4.cpp",
# "torch/csrc/autograd/generated/TraceTypeEverything.cpp",
"torch/csrc/autograd/generated/ADInplaceOrViewType_0.cpp",
"torch/csrc/autograd/generated/ADInplaceOrViewType_1.cpp",
# "torch/csrc/autograd/generated/ADInplaceOrViewTypeEverything.cpp",
"torch/csrc/autograd/generated/Functions.h",
"torch/csrc/autograd/generated/Functions.cpp",
"torch/csrc/autograd/generated/variable_factories.h",
]
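# Note: the VariableType and TraceType sources above are sharded by the code
# generator into numbered pieces (_0 through _4), presumably so the large
# generated translation units can be compiled in parallel.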
libtorch_python_generated_sources = [
"torch/csrc/autograd/generated/python_functions.h",
"torch/csrc/autograd/generated/python_functions_0.cpp",
"torch/csrc/autograd/generated/python_functions_1.cpp",
"torch/csrc/autograd/generated/python_functions_2.cpp",
"torch/csrc/autograd/generated/python_functions_3.cpp",
"torch/csrc/autograd/generated/python_functions_4.cpp",
"torch/csrc/autograd/generated/python_variable_methods.cpp",
"torch/csrc/autograd/generated/python_torch_functions_0.cpp",
"torch/csrc/autograd/generated/python_torch_functions_1.cpp",
"torch/csrc/autograd/generated/python_torch_functions_2.cpp",
"torch/csrc/autograd/generated/python_nn_functions.cpp",
"torch/csrc/autograd/generated/python_fft_functions.cpp",
"torch/csrc/autograd/generated/python_linalg_functions.cpp",
"torch/csrc/autograd/generated/python_sparse_functions.cpp",
"torch/csrc/autograd/generated/python_special_functions.cpp",
"torch/csrc/autograd/generated/python_return_types.cpp",
]
genrule(
name = "all_generated_code",
srcs = [
"aten/src/ATen/native/native_functions.yaml",
],
outs = libtorch_cpp_generated_sources + libtorch_python_generated_sources,
cmd = "$(location :generate_code) --install_dir `dirname $(location torch/csrc/autograd/generated/variable_factories.h)`/../.. --native-functions-path $(location aten/src/ATen/native/native_functions.yaml) --nn-path aten/src",
tools = [":generate_code"],
)
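# A minimal way to exercise the code generation step in isolation, assuming a
# working Bazel workspace, is to build this rule directly:
#
#   bazel build //:all_generated_code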
filegroup(
name = "cpp_generated_code",
data = [":all_generated_code"],
srcs = libtorch_cpp_generated_sources,
)
filegroup(
name = "python_generated_code",
data = [":all_generated_code"],
srcs = libtorch_python_generated_sources,
)
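# These filegroups let downstream rules depend on just the C++ or just the
# Python slice of the generated sources. A hypothetical consumer (sketch only,
# not a target defined in this file) might look like:
#
#   cc_library(
#       name = "uses_generated_code",
#       srcs = [":cpp_generated_code"],
#   )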
exports_files(
srcs = ["aten/src/ATen/cpu/tbb/extra/version_string.ver.in"],
)
# ATen
filegroup(
name = "aten_base_cpp",
srcs = glob([
"aten/src/ATen/*.cpp",
"aten/src/ATen/detail/*.cpp",
"aten/src/ATen/cpu/*.cpp",
]),
)
filegroup(
name = "ATen_CORE_SRCS",
srcs = glob(
[
"aten/src/ATen/core/**/*.cpp",
],
exclude = [
"aten/src/ATen/core/**/*_test.cpp",
],
),
)
filegroup(
name = "aten_native_cpp",
srcs = glob(["aten/src/ATen/native/*.cpp"]),
)
filegroup(
name = "aten_native_sparse_cpp",
srcs = glob(["aten/src/ATen/native/sparse/*.cpp"]),
)
filegroup(
name = "aten_native_quantized_cpp",
srcs = glob(
[
"aten/src/ATen/native/quantized/*.cpp",
"aten/src/ATen/native/quantized/cpu/*.cpp",
],
),
)
filegroup(
name = "aten_native_mkl_cpp",
srcs = glob(["aten/src/ATen/native/mkl/*.cpp"]),
)
filegroup(
name = "aten_native_mkldnn_cpp",
srcs = glob(["aten/src/ATen/native/mkldnn/*.cpp"]),
)
filegroup(
name = "aten_native_xnnpack",
srcs = glob(["aten/src/ATen/native/xnnpack/*.cpp"]),
)
filegroup(
name = "aten_base_vulkan",
srcs = glob(["aten/src/ATen/vulkan/*.cpp"]),
)
filegroup(
name = "aten_base_metal",
srcs = glob(["aten/src/ATen/metal/*.cpp"]),
)
filegroup(
name = "ATen_QUANTIZED_SRCS",
srcs = glob(
[
"aten/src/ATen/quantized/**/*.cpp",
],
exclude = [
"aten/src/ATen/quantized/**/*_test.cpp",
],
),
)
filegroup(
name = "aten_cuda_srcs",
srcs = [
"aten/src/ATen/cuda/CUDABlas.cpp",
"aten/src/ATen/cuda/CUDASolver.cpp",
"aten/src/ATen/cuda/CUDAContext.cpp",
"aten/src/ATen/cuda/CUDAGeneratorImpl.cpp",
"aten/src/ATen/cuda/CUDAGraph.cpp",
"aten/src/ATen/cuda/CuSparseHandlePool.cpp",
"aten/src/ATen/cuda/CublasHandlePool.cpp",
"aten/src/ATen/cuda/CusolverDnHandlePool.cpp",
"aten/src/ATen/cuda/PinnedMemoryAllocator.cpp",
"aten/src/ATen/cuda/detail/CUDAHooks.cpp",
"aten/src/ATen/cudnn/AutocastRNN.cpp",
"aten/src/ATen/cudnn/Descriptors.cpp",
"aten/src/ATen/cudnn/Handle.cpp",
"aten/src/ATen/cudnn/Types.cpp",
"aten/src/ATen/native/cuda/CUDAUnaryOps.cpp",
"aten/src/ATen/native/cuda/TensorShapeCUDA.cpp",
"aten/src/ATen/native/cudnn/AffineGridGenerator.cpp",
"aten/src/ATen/native/cudnn/BatchNorm.cpp",
"aten/src/ATen/native/cudnn/Conv.cpp",
"aten/src/ATen/native/cudnn/GridSampler.cpp",
"aten/src/ATen/native/cudnn/LossCTC.cpp",
"aten/src/ATen/native/cudnn/RNN.cpp",
"aten/src/ATen/native/miopen/BatchNorm_miopen.cpp",
"aten/src/ATen/native/miopen/Conv_miopen.cpp",
"aten/src/ATen/native/miopen/RNN_miopen.cpp",
"aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp",
"aten/src/ATen/native/sparse/cuda/SparseBlas.cpp",
"aten/src/ATen/native/sparse/cuda/SparseBlasImpl.cpp",
],
)
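# CUDA device sources for :aten_cuda. Note: the .cu.cc suffix (rather than
# plain .cu) is presumably what the cu_library rule from //tools/rules:cu.bzl
# expects.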
filegroup(
name = "aten_srcs_cu",
srcs = [
"aten/src/ATen/cuda/cub.cu.cc",
"aten/src/ATen/cuda/detail/IndexUtils.cu.cc",
"aten/src/ATen/cuda/detail/CUDAGraphsUtils.cu.cc",
"aten/src/ATen/native/cuda/Activation.cu.cc",
"aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu.cc",
"aten/src/ATen/native/cuda/AdaptiveAveragePooling3d.cu.cc",
"aten/src/ATen/native/cuda/AdaptiveMaxPooling2d.cu.cc",
"aten/src/ATen/native/cuda/AdaptiveMaxPooling3d.cu.cc",
"aten/src/ATen/native/cuda/AveragePool2d.cu.cc",
"aten/src/ATen/native/cuda/AveragePool3d.cu.cc",
"aten/src/ATen/native/cuda/BatchLinearAlgebra.cu.cc",
"aten/src/ATen/native/cuda/BatchLinearAlgebraLib.cu.cc",
"aten/src/ATen/native/cuda/BinaryArithmeticKernel.cu.cc",
"aten/src/ATen/native/cuda/BinaryCompareKernel.cu.cc",
"aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu.cc",
"aten/src/ATen/native/cuda/CUDAScalar.cu.cc",
"aten/src/ATen/native/cuda/Col2Im.cu.cc",
"aten/src/ATen/native/cuda/Copy.cu.cc",
"aten/src/ATen/native/cuda/CrossKernel.cu.cc",
"aten/src/ATen/native/cuda/DilatedMaxPool2d.cu.cc",
"aten/src/ATen/native/cuda/DilatedMaxPool3d.cu.cc",
"aten/src/ATen/native/cuda/DistanceKernel.cu.cc",
"aten/src/ATen/native/cuda/Distributions.cu.cc",
"aten/src/ATen/native/cuda/Dropout.cu.cc",
"aten/src/ATen/native/cuda/Embedding.cu.cc",
"aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu.cc",
"aten/src/ATen/native/cuda/EmbeddingBag.cu.cc",
"aten/src/ATen/native/cuda/FillKernel.cu.cc",
"aten/src/ATen/native/cuda/FractionalMaxPool2d.cu.cc",
"aten/src/ATen/native/cuda/FractionalMaxPool3d.cu.cc",
"aten/src/ATen/native/cuda/GridSampler.cu.cc",
"aten/src/ATen/native/cuda/Im2Col.cu.cc",
"aten/src/ATen/native/cuda/IndexKernel.cu.cc",
"aten/src/ATen/native/cuda/Indexing.cu.cc",
"aten/src/ATen/native/cuda/Lerp.cu.cc",
"aten/src/ATen/native/cuda/LinearAlgebra.cu.cc",
"aten/src/ATen/native/cuda/Loss.cu.cc",
"aten/src/ATen/native/cuda/LossCTC.cu.cc",
"aten/src/ATen/native/cuda/MaxUnpooling.cu.cc",
"aten/src/ATen/native/cuda/MultinomialKernel.cu.cc",
"aten/src/ATen/native/cuda/MultiLabelMarginCriterion.cu.cc",
"aten/src/ATen/native/cuda/NaiveConvolutionTranspose2d.cu.cc",
"aten/src/ATen/native/cuda/NaiveConvolutionTranspose3d.cu.cc",
"aten/src/ATen/native/cuda/NaiveDilatedConvolution.cu.cc",
"aten/src/ATen/native/cuda/NLLLoss2d.cu.cc",
"aten/src/ATen/native/cuda/Normalization.cu.cc",
"aten/src/ATen/native/cuda/PointwiseOpsKernel.cu.cc",
"aten/src/ATen/native/cuda/PowKernel.cu.cc",
"aten/src/ATen/native/cuda/RNN.cu.cc",
"aten/src/ATen/native/cuda/RangeFactories.cu.cc",
"aten/src/ATen/native/cuda/Reduce.cu.cc",
"aten/src/ATen/native/cuda/ReduceOpsKernel.cu.cc",
"aten/src/ATen/native/cuda/ReflectionPad.cu.cc",
"aten/src/ATen/native/cuda/Repeat.cu.cc",
"aten/src/ATen/native/cuda/ReplicationPadding.cu.cc",
"aten/src/ATen/native/cuda/Resize.cu.cc",
"aten/src/ATen/native/cuda/SegmentReduce.cu.cc",
"aten/src/ATen/native/cuda/SoftMax.cu.cc",
"aten/src/ATen/native/cuda/SortingKthValue.cu.cc",
"aten/src/ATen/native/cuda/SparseMM.cu.cc",
"aten/src/ATen/native/cuda/SpectralOps.cu.cc",
"aten/src/ATen/native/cuda/SummaryOps.cu.cc",
"aten/src/ATen/native/cuda/TensorCompare.cu.cc",
"aten/src/ATen/native/cuda/TensorFactories.cu.cc",
"aten/src/ATen/native/cuda/TensorTopK.cu.cc",
"aten/src/ATen/native/cuda/TensorTransformations.cu.cc",
"aten/src/ATen/native/cuda/TriangularOps.cu.cc",
"aten/src/ATen/native/cuda/UnaryOpsKernel.cu.cc",
"aten/src/ATen/native/cuda/UnarySpecialOpsKernel.cu.cc",
"aten/src/ATen/native/cuda/Unique.cu.cc",
"aten/src/ATen/native/cuda/UpSampleBicubic2d.cu.cc",
"aten/src/ATen/native/cuda/UpSampleBilinear2d.cu.cc",
"aten/src/ATen/native/cuda/UpSampleLinear1d.cu.cc",
"aten/src/ATen/native/cuda/UpSampleNearest1d.cu.cc",
"aten/src/ATen/native/cuda/UpSampleNearest2d.cu.cc",
"aten/src/ATen/native/cuda/UpSampleNearest3d.cu.cc",
"aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu.cc",
"aten/src/ATen/native/cuda/WeightNorm.cu.cc",
"aten/src/ATen/native/cuda/layer_norm_kernel.cu.cc",
"aten/src/ATen/native/quantized/cuda/fake_quantize_core.cu.cc",
"aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu.cc",
"aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu.cc",
"aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu.cc",
],
)
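# Generate ATen's Config.h from the cmake template: in this build MKL-DNN,
# BLAS/LAPACK, and the native thread pool are enabled, while MKL, FFTW,
# pocketfft, and NNPACK are disabled.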
header_template_rule(
name = "aten_src_ATen_config",
src = "aten/src/ATen/Config.h.in",
out = "aten/src/ATen/Config.h",
substitutions = {
"@AT_MKLDNN_ENABLED@": "1",
"@AT_MKL_ENABLED@": "0",
"@AT_FFTW_ENABLED@": "0",
"@AT_POCKETFFT_ENABLED@": "0",
"@AT_NNPACK_ENABLED@": "0",
"@CAFFE2_STATIC_LINK_CUDA_INT@": "0",
"@AT_BUILD_WITH_BLAS@": "1",
"@AT_BUILD_WITH_LAPACK@": "1",
"@AT_PARALLEL_OPENMP@": "0",
"@AT_PARALLEL_NATIVE@": "1",
"@AT_PARALLEL_NATIVE_TBB@": "0",
"@AT_BLAS_F2C@": "0",
"@AT_BLAS_USE_CBLAS_DOT@": "1",
},
)
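# Same pattern for the CUDA config header: cuDNN is enabled, ROCm and MAGMA
# are not.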
header_template_rule(
name = "aten_src_ATen_cuda_config",
src = "aten/src/ATen/cuda/CUDAConfig.h.in",
out = "aten/src/ATen/cuda/CUDAConfig.h",
substitutions = {
"@AT_CUDNN_ENABLED@": "1",
"@AT_ROCM_ENABLED@": "0",
"@AT_MAGMA_ENABLED@": "0",
"@NVCC_FLAGS_EXTRA@": "",
},
)
cc_library(
name = "aten_headers",
hdrs = [
"torch/csrc/Export.h",
"torch/csrc/jit/frontend/function_schema_parser.h",
] + glob([
"aten/src/**/*.h",
"aten/src/**/*.hpp",
"aten/src/TH/**/*.cpp",
"aten/src/THC/*.cuh",
],
exclude = [
"aten/src/ATen/Config.h",
],) + [
":generated_cpp",
":aten_src_ATen_config",
],
includes = [
"aten/src",
"aten/src/TH",
],
deps = [
":c10_headers",
],
)
ATEN_COPTS = COMMON_COPTS + [
"-DUSE_AVX",
"-DUSE_AVX2",
"-DCAFFE2_BUILD_MAIN_LIBS",
"-DHAVE_AVX_CPU_DEFINITION",
"-DHAVE_AVX2_CPU_DEFINITION",
"-fvisibility-inlines-hidden",
"-fno-math-errno",
"-fno-trapping-math",
]
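# intern_build_aten_ops (from //:aten.bzl) compiles the ATen operator sources
# with the copts above; given the HAVE_AVX*_CPU_DEFINITION flags, it
# presumably builds the vectorized kernels once per CPU capability.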
intern_build_aten_ops(
copts = ATEN_COPTS,
deps = [
":aten_headers",
"@sleef",
"@fbgemm",
],
)
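# The main ATen library (CPU by default; with CUDA configured it also ships
# libcaffe2_nvrtc.so as runtime data). Built with e.g. `bazel build //:aten`.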
cc_library(
name = "aten",
srcs = [
":ATen_CORE_SRCS",
":ATen_QUANTIZED_SRCS",
":aten_base_cpp",
":aten_base_metal",
":aten_base_vulkan",
":aten_native_cpp",
":aten_native_mkl_cpp",
":aten_native_mkldnn_cpp",
":aten_native_quantized_cpp",
":aten_native_sparse_cpp",
":aten_native_xnnpack",
":aten_src_ATen_config",
":generated_cpp",
],
copts = ATEN_COPTS,
data = if_cuda(
[":libcaffe2_nvrtc.so"],
[],
),
visibility = ["//visibility:public"],
deps = [
":ATen_CPU",
":aten_headers",
":caffe2_for_aten_headers",
":torch_headers",
"@fbgemm",
"@ideep",
],
alwayslink = True,
)
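# Thin wrapper around the NVRTC stub sources; linked statically into
# libcaffe2_nvrtc.so below.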
cc_library(
name = "aten_nvrtc",
srcs = glob([
"aten/src/ATen/cuda/nvrtc_stub/*.cpp",
]),
copts = ATEN_COPTS,
linkstatic = True,
visibility = ["//visibility:public"],
deps = [
":aten_headers",
":c10_headers",
"@cuda",
"@cuda//:cuda_driver",
"@cuda//:nvrtc",
],
alwayslink = True,
)
cc_binary(
name = "libcaffe2_nvrtc.so",
linkshared = True,
visibility = ["//visibility:public"],
deps = [
":aten_nvrtc",
],
)
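# :aten ships this shared object as runtime data (see its `data` attribute)
# instead of linking it, presumably so the CUDA driver and NVRTC are resolved
# lazily at run time.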
cc_library(
name = "aten_cuda_cpp",
srcs = [":aten_cuda_srcs"],
copts = ATEN_COPTS,
visibility = ["//visibility:public"],
deps = [
":aten",
"@cuda",
"@cuda//:nvrtc",
"@cudnn",
],
alwayslink = True,
)
torch_cuda_half_options = [
"-DCUDA_HAS_FP16=1",
"-D__CUDA_NO_HALF_OPERATORS__",
"-D__CUDA_NO_HALF_CONVERSIONS__",
"-D__CUDA_NO_BFLOAT16_CONVERSIONS__",
"-D__CUDA_NO_HALF2_OPERATORS__",
]
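# These defines suppress CUDA's built-in half/bfloat16 operators and
# conversions so that ATen's own c10::Half overloads are used instead.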
cu_library(
name = "aten_cuda",
srcs = [
":aten_srcs_cu",
],
copts = ATEN_COPTS + torch_cuda_half_options,
visibility = ["//visibility:public"],
deps = [
":aten_cuda_cpp",
"@cuda//:cublas",
"@cuda//:cufft",
"@cuda//:cusparse",
],
alwayslink = True,
)
# caffe2
CAFFE2_COPTS = COMMON_COPTS + [
"-Dcaffe2_EXPORTS",
"-DCAFFE2_USE_GLOO",
"-DCAFFE2_USE_CUDNN",
"-DCAFFE2_BUILD_MAIN_LIB",
"-fvisibility-inlines-hidden",
"-fno-math-errno",
"-fno-trapping-math",
]
proto_library(
name = "caffe2_proto_source",
srcs = glob([
"caffe2/proto/*.proto",
]),
visibility = ["//visibility:public"],
)
cc_proto_library(
name = "caffe2_protos",
deps = [":caffe2_proto_source"],
)
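# caffe2/core/macros.h is generated the same way as the ATen config headers:
# cmakedefine becomes define, and features that are off in this build are
# rewritten to /* #undef ... */.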
header_template_rule(
name = "caffe2_core_macros_h",
src = "caffe2/core/macros.h.in",
out = "caffe2/core/macros.h",
substitutions = {
"@CAFFE2_VERSION_MAJOR@": "1",
"@CAFFE2_VERSION_MINOR@": "3",
"@CAFFE2_VERSION_PATCH@": "0",
"cmakedefine": "define",
"#define CAFFE2_FORCE_FALLBACK_CUDA_MPI": "/* #undef CAFFE2_FORCE_FALLBACK_CUDA_MPI */",
"#define CAFFE2_HAS_MKL_DNN": "/* #undef CAFFE2_HAS_MKL_DNN */",
"#define CAFFE2_HAS_MKL_SGEMM_PACK": "/* #undef CAFFE2_HAS_MKL_SGEMM_PACK */",
"#define CAFFE2_THREADPOOL_MAIN_IMBALANCE": "/* #undef CAFFE2_THREADPOOL_MAIN_IMBALANCE */",
"#define CAFFE2_THREADPOOL_STATS": "/* #undef CAFFE2_THREADPOOL_STATS */",
"#define CAFFE2_USE_ACCELERATE": "/* #undef CAFFE2_USE_ACCELERATE */",
"#define CAFFE2_USE_EIGEN_FOR_BLAS": "/* #undef CAFFE2_USE_EIGEN_FOR_BLAS */",
"#define CAFFE2_USE_FBCODE": "/* #undef CAFFE2_USE_FBCODE */",
"#define CAFFE2_USE_GOOGLE_GLOG": "/* #undef CAFFE2_USE_GOOGLE_GLOG */",
"#define CAFFE2_USE_LITE_PROTO": "/* #undef CAFFE2_USE_LITE_PROTO */",
"#define CAFFE2_USE_MKL\n": "/* #undef CAFFE2_USE_MKL */\n",
"#define CAFFE2_USE_NVTX": "/* #undef CAFFE2_USE_NVTX */",
"#define CAFFE2_USE_TRT": "/* #undef CAFFE2_USE_TRT */",
},
)
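# The caffe2_*_srcs filegroups below partition the Caffe2 C++ sources by
# subdirectory (contrib, core, distributed, ideep, onnx, operators, ...).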
filegroup(
name = "caffe2_contrib_srcs",
srcs = [
"caffe2/contrib/gloo/allgather_ops.cc",
"caffe2/contrib/gloo/allreduce_ops.cc",
"caffe2/contrib/gloo/barrier_ops.cc",
"caffe2/contrib/gloo/broadcast_ops.cc",
"caffe2/contrib/gloo/common.cc",
"caffe2/contrib/gloo/common_world_ops.cc",
"caffe2/contrib/gloo/context.cc",
"caffe2/contrib/gloo/reduce_scatter_ops.cc",
"caffe2/contrib/gloo/store_handler.cc",
],
)
filegroup(
name = "caffe2_core_srcs",
srcs = [
"caffe2/core/allocator.cc",
"caffe2/core/blob_serialization.cc",
"caffe2/core/blob_stats.cc",
"caffe2/core/common.cc",
"caffe2/core/context.cc",
"caffe2/core/context_base.cc",
"caffe2/core/db.cc",
"caffe2/core/event.cc",
"caffe2/core/export_c10_op_to_caffe2.cc",
"caffe2/core/graph.cc",
"caffe2/core/init.cc",
"caffe2/core/init_denormals.cc",
"caffe2/core/init_intrinsics_check.cc",
"caffe2/core/init_omp.cc",
"caffe2/core/int8_serialization.cc",
"caffe2/core/memonger.cc",
"caffe2/core/module.cc",
"caffe2/core/net.cc",
"caffe2/core/net_async_base.cc",
"caffe2/core/net_async_scheduling.cc",
"caffe2/core/net_async_task.cc",
"caffe2/core/net_async_task_future.cc",
"caffe2/core/net_async_task_graph.cc",
"caffe2/core/net_async_tracing.cc",
"caffe2/core/net_dag_utils.cc",
"caffe2/core/net_parallel.cc",
"caffe2/core/net_simple.cc",
"caffe2/core/net_simple_refcount.cc",
"caffe2/core/nomnigraph/Representations/NeuralNet.cc",
"caffe2/core/nomnigraph/tests/test_util.cc",
"caffe2/core/numa.cc",
"caffe2/core/operator.cc",
"caffe2/core/operator_schema.cc",
"caffe2/core/plan_executor.cc",
"caffe2/core/prof_dag_counters.cc",
"caffe2/core/qtensor.cc",
"caffe2/core/qtensor_serialization.cc",
"caffe2/core/stats.cc",
"caffe2/core/tensor.cc",
"caffe2/core/tensor_int8.cc",
"caffe2/core/test_utils.cc",
"caffe2/core/transform.cc",
"caffe2/core/types.cc",
"caffe2/core/workspace.cc",
],
)
filegroup(
name = "caffe2_distributed_srcs",
srcs = [
"caffe2/distributed/file_store_handler.cc",
"caffe2/distributed/file_store_handler_op.cc",
"caffe2/distributed/store_handler.cc",
"caffe2/distributed/store_ops.cc",
],
)
filegroup(
name = "caffe2_ideep_srcs",
srcs = [
"caffe2/ideep/operators/adam_op.cc",
"caffe2/ideep/operators/channel_shuffle_op.cc",
"caffe2/ideep/operators/concat_split_op.cc",
"caffe2/ideep/operators/conv_op.cc",
"caffe2/ideep/operators/conv_transpose_op.cc",
"caffe2/ideep/operators/dropout_op.cc",
"caffe2/ideep/operators/elementwise_sum_op.cc",
"caffe2/ideep/operators/expand_squeeze_dims_op.cc",
"caffe2/ideep/operators/fully_connected_op.cc",
"caffe2/ideep/operators/local_response_normalization_op.cc",
"caffe2/ideep/operators/momentum_sgd_op.cc",
"caffe2/ideep/operators/operator_fallback_ideep.cc",
"caffe2/ideep/operators/order_switch_ops.cc",
"caffe2/ideep/operators/pool_op.cc",
"caffe2/ideep/operators/quantization/int8_add_op.cc",
"caffe2/ideep/operators/quantization/int8_conv_op.cc",
"caffe2/ideep/operators/quantization/int8_dequantize_op.cc",
"caffe2/ideep/operators/quantization/int8_fully_connected_op.cc",
"caffe2/ideep/operators/quantization/int8_given_tensor_fill_op.cc",
"caffe2/ideep/operators/quantization/int8_pool_op.cc",
"caffe2/ideep/operators/quantization/int8_quantize_op.cc",
"caffe2/ideep/operators/quantization/int8_relu_op.cc",
"caffe2/ideep/operators/queue_ops.cc",
"caffe2/ideep/operators/relu_op.cc",
"caffe2/ideep/operators/reshape_op.cc",
"caffe2/ideep/operators/shape_op.cc",
"caffe2/ideep/operators/sigmoid_op.cc",
"caffe2/ideep/operators/spatial_batch_norm_op.cc",
"caffe2/ideep/operators/transpose_op.cc",
"caffe2/ideep/operators/utility_ops.cc",
"caffe2/ideep/utils/ideep_register.cc",
],
)
filegroup(
name = "caffe2_onnx_srcs",
srcs = [
"caffe2/onnx/backend.cc",
"caffe2/onnx/backend_rep.cc",
"caffe2/onnx/device.cc",
"caffe2/onnx/helper.cc",
"caffe2/onnx/offline_tensor.cc",
"caffe2/onnx/onnx_exporter.cc",
"caffe2/onnx/onnxifi_graph_info.cc",
"caffe2/onnx/onnxifi_init.cc",
],
)
filegroup(
name = "caffe2_operators_srcs",
srcs = [
"caffe2/operators/abs_op.cc",
"caffe2/operators/accumulate_op.cc",
"caffe2/operators/accuracy_op.cc",
"caffe2/operators/acos_op.cc",
"caffe2/operators/affine_channel_op.cc",
"caffe2/operators/alias_with_name.cc",
"caffe2/operators/apmeter_op.cc",
"caffe2/operators/arg_ops.cc",
"caffe2/operators/asin_op.cc",
"caffe2/operators/assert_op.cc",
"caffe2/operators/atan_op.cc",
"caffe2/operators/atomic_ops.cc",
"caffe2/operators/batch_box_cox_op.cc",
"caffe2/operators/batch_bucketize_op.cc",
"caffe2/operators/batch_gather_ops.cc",
"caffe2/operators/batch_matmul_op.cc",
"caffe2/operators/batch_moments_op.cc",
"caffe2/operators/batch_permutation_op.cc",
"caffe2/operators/batch_sparse_to_dense_op.cc",
"caffe2/operators/bbox_transform_op.cc",
"caffe2/operators/bisect_percentile_op.cc",
"caffe2/operators/boolean_mask_ops.cc",
"caffe2/operators/boolean_unmask_ops.cc",
"caffe2/operators/box_with_nms_limit_op.cc",
"caffe2/operators/bucketize_op.cc",
"caffe2/operators/byte_weight_dequant_op.cc",
"caffe2/operators/cast_op.cc",
"caffe2/operators/cbrt_op.cc",
"caffe2/operators/cc_bmm_bg_op.cc",
"caffe2/operators/ceil_op.cc",
"caffe2/operators/channel_backprop_stats_op.cc",
"caffe2/operators/channel_shuffle_op.cc",
"caffe2/operators/channel_stats_op.cc",
"caffe2/operators/clip_op.cc",
"caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.cc",
"caffe2/operators/communicator_op.cc",
"caffe2/operators/concat_split_op.cc",
"caffe2/operators/conditional_op.cc",
"caffe2/operators/conv_gradient_op.cc",
"caffe2/operators/conv_op.cc",
"caffe2/operators/conv_op_eigen.cc",
"caffe2/operators/conv_op_shared.cc",
"caffe2/operators/conv_transpose_gradient_op.cc",
"caffe2/operators/conv_transpose_op_mobile.cc",
"caffe2/operators/copy_op.cc",
"caffe2/operators/copy_rows_to_tensor_op.cc",
"caffe2/operators/cos_op.cc",
"caffe2/operators/cosh_op.cc",
"caffe2/operators/cosine_embedding_criterion_op.cc",
"caffe2/operators/counter_ops.cc",
"caffe2/operators/crash_op.cc",
"caffe2/operators/create_scope_op.cc",
"caffe2/operators/crf_viterbi_op.cc",
"caffe2/operators/cross_entropy_op.cc",
"caffe2/operators/ctc_beam_search_decoder_op.cc",
"caffe2/operators/ctc_greedy_decoder_op.cc",
"caffe2/operators/cube_op.cc",
"caffe2/operators/data_couple.cc",
"caffe2/operators/dataset_ops.cc",
"caffe2/operators/deform_conv_gradient_op.cc",
"caffe2/operators/deform_conv_op.cc",
"caffe2/operators/dense_vector_to_id_list_op.cc",
"caffe2/operators/distance_op.cc",
"caffe2/operators/do_op.cc",
"caffe2/operators/dropout_op.cc",
"caffe2/operators/elementwise_add_gradient_op.cc",
"caffe2/operators/elementwise_add_op.cc",
"caffe2/operators/elementwise_div_gradient_op.cc",
"caffe2/operators/elementwise_div_op.cc",
"caffe2/operators/elementwise_linear_op.cc",
"caffe2/operators/elementwise_logical_ops.cc",
"caffe2/operators/elementwise_mul_gradient_op.cc",
"caffe2/operators/elementwise_mul_op.cc",
"caffe2/operators/elementwise_ops.cc",
"caffe2/operators/elementwise_ops_schema.cc",
"caffe2/operators/elementwise_ops_utils.cc",
"caffe2/operators/elementwise_sub_gradient_op.cc",
"caffe2/operators/elementwise_sub_op.cc",
"caffe2/operators/elementwise_sum_op.cc",
"caffe2/operators/elu_op.cc",
"caffe2/operators/enforce_finite_op.cc",
"caffe2/operators/ensure_clipped_op.cc",
"caffe2/operators/ensure_cpu_output_op.cc",
"caffe2/operators/erf_op.cc",
"caffe2/operators/exp_op.cc",
"caffe2/operators/expand_op.cc",
"caffe2/operators/expand_squeeze_dims_op.cc",
"caffe2/operators/fc_inference.cc",
"caffe2/operators/feature_maps_ops.cc",
"caffe2/operators/feed_blob_op.cc",
"caffe2/operators/filler_op.cc",
"caffe2/operators/find_duplicate_elements_op.cc",
"caffe2/operators/find_op.cc",
"caffe2/operators/flatten_op.cc",
"caffe2/operators/flexible_top_k.cc",
"caffe2/operators/floor_op.cc",
"caffe2/operators/free_op.cc",
"caffe2/operators/fully_connected_op.cc",
"caffe2/operators/fused_rowwise_8bit_conversion_ops.cc",
"caffe2/operators/fused_rowwise_random_quantization_ops.cc",
"caffe2/operators/gather_fused_8bit_rowwise_op.cc",
"caffe2/operators/gather_op.cc",
"caffe2/operators/gather_ranges_to_dense_op.cc",
"caffe2/operators/gelu_op.cc",
"caffe2/operators/generate_proposals_op.cc",
"caffe2/operators/given_tensor_byte_string_to_uint8_fill_op.cc",
"caffe2/operators/given_tensor_fill_op.cc",
"caffe2/operators/glu_op.cc",
"caffe2/operators/group_norm_op.cc",
"caffe2/operators/gru_unit_op.cc",
"caffe2/operators/h_softmax_op.cc",
"caffe2/operators/half_float_ops.cc",
"caffe2/operators/hard_sigmoid_op.cc",
"caffe2/operators/heatmap_max_keypoint_op.cc",
"caffe2/operators/if_op.cc",
"caffe2/operators/im2col_op.cc",
"caffe2/operators/index_hash_ops.cc",
"caffe2/operators/index_ops.cc",
"caffe2/operators/inference_lstm_op.cc",
"caffe2/operators/instance_norm_gradient_op.cc",
"caffe2/operators/instance_norm_op.cc",
"caffe2/operators/integral_image_op.cc",
"caffe2/operators/is_empty_op.cc",
"caffe2/operators/jsd_op.cc",
"caffe2/operators/key_split_ops.cc",
"caffe2/operators/last_n_window_collector.cc",
"caffe2/operators/layer_norm_op.cc",
"caffe2/operators/leaky_relu_op.cc",
"caffe2/operators/length_split_op.cc",
"caffe2/operators/lengths_pad_op.cc",
"caffe2/operators/lengths_reducer_fused_8bit_rowwise_ops.cc",
"caffe2/operators/lengths_reducer_ops.cc",
"caffe2/operators/lengths_reducer_rowwise_8bit_ops.cc",
"caffe2/operators/lengths_tile_op.cc",
"caffe2/operators/lengths_top_k_op.cc",
"caffe2/operators/listwise_l2r_op.cc",
"caffe2/operators/load_save_op.cc",
"caffe2/operators/load_save_op_util.cc",
"caffe2/operators/local_response_normalization_op.cc",
"caffe2/operators/locally_connected_op.cc",
"caffe2/operators/locally_connected_op_util.cc",
"caffe2/operators/log_op.cc",
"caffe2/operators/logit_op.cc",
"caffe2/operators/loss_op.cc",
"caffe2/operators/lp_pool_op.cc",
"caffe2/operators/lpnorm_op.cc",
"caffe2/operators/lstm_unit_op.cc",
"caffe2/operators/map_ops.cc",
"caffe2/operators/margin_ranking_criterion_op.cc",
"caffe2/operators/matmul_op.cc",
"caffe2/operators/mean_op.cc",
"caffe2/operators/merge_id_lists_op.cc",
"caffe2/operators/minmax_gradient_ops.cc",
"caffe2/operators/minmax_ops.cc",
"caffe2/operators/mod_op.cc",
"caffe2/operators/moments_op.cc",
"caffe2/operators/multi_class_accuracy_op.cc",
"caffe2/operators/negate_gradient_op.cc",
"caffe2/operators/negative_op.cc",
"caffe2/operators/ngram_ops.cc",
"caffe2/operators/norm_planar_yuv_op.cc",
"caffe2/operators/normalize_l1_op.cc",
"caffe2/operators/normalize_op.cc",
"caffe2/operators/numpy_tile_op.cc",
"caffe2/operators/one_hot_ops.cc",
"caffe2/operators/onnx_while_op.cc",
"caffe2/operators/order_switch_ops.cc",
"caffe2/operators/pack_rnn_sequence_op.cc",
"caffe2/operators/pack_segments.cc",
"caffe2/operators/pad_op.cc",
"caffe2/operators/partition_ops.cc",
"caffe2/operators/percentile_op.cc",
"caffe2/operators/perplexity_op.cc",
"caffe2/operators/piecewise_linear_transform_op.cc",
"caffe2/operators/pool_gradient_op.cc",
"caffe2/operators/pool_op.cc",
"caffe2/operators/pool_op_util.cc",
"caffe2/operators/pow_op.cc",
"caffe2/operators/prelu_op.cc",
"caffe2/operators/prepend_dim_op.cc",
"caffe2/operators/quant_decode_op.cc",
"caffe2/operators/rank_loss_op.cc",
"caffe2/operators/reciprocal_gradient_op.cc",
"caffe2/operators/reciprocal_op.cc",
"caffe2/operators/reduce_front_back_max_ops.cc",
"caffe2/operators/reduce_front_back_mean_ops.cc",
"caffe2/operators/reduce_front_back_sum_ops.cc",
"caffe2/operators/reduce_ops.cc",
"caffe2/operators/reduction_ops.cc",
"caffe2/operators/relu_n_op.cc",
"caffe2/operators/relu_op.cc",
"caffe2/operators/remove_data_blocks_op.cc",
"caffe2/operators/replace_nan_op.cc",
"caffe2/operators/reservoir_sampling.cc",
"caffe2/operators/reshape_op.cc",
"caffe2/operators/resize_3d_op.cc",
"caffe2/operators/resize_op.cc",
"caffe2/operators/reverse_packed_segs_op.cc",
"caffe2/operators/rmac_regions_op.cc",
"caffe2/operators/rnn/recurrent_network_blob_fetcher_op.cc",
"caffe2/operators/rnn/recurrent_network_executor.cc",
"caffe2/operators/rnn/recurrent_network_op.cc",
"caffe2/operators/roi_align_gradient_op.cc",
"caffe2/operators/roi_align_op.cc",
"caffe2/operators/roi_align_rotated_gradient_op.cc",
"caffe2/operators/roi_align_rotated_op.cc",
"caffe2/operators/roi_pool_op.cc",
"caffe2/operators/rowmul_op.cc",
"caffe2/operators/rsqrt_op.cc",
"caffe2/operators/scale_blobs_op.cc",
"caffe2/operators/scale_op.cc",
"caffe2/operators/segment_reduction_op.cc",
"caffe2/operators/selu_op.cc",
"caffe2/operators/sequence_ops.cc",
"caffe2/operators/shape_op.cc",
"caffe2/operators/sigmoid_gradient_op.cc",
"caffe2/operators/sigmoid_op.cc",
"caffe2/operators/sin_op.cc",
"caffe2/operators/sinh_op.cc",
"caffe2/operators/sinusoid_position_encoding_op.cc",
"caffe2/operators/slice_op.cc",
"caffe2/operators/softmax_op.cc",
"caffe2/operators/softmax_utils.cc",
"caffe2/operators/softmax_with_loss_op.cc",
"caffe2/operators/softplus_op.cc",
"caffe2/operators/softsign_op.cc",
"caffe2/operators/space_batch_op.cc",
"caffe2/operators/sparse_dropout_with_replacement_op.cc",
"caffe2/operators/sparse_normalize_op.cc",
"caffe2/operators/sparse_to_dense_mask_op.cc",
"caffe2/operators/sparse_to_dense_op.cc",
"caffe2/operators/spatial_batch_norm_gradient_op.cc",
"caffe2/operators/spatial_batch_norm_op.cc",
"caffe2/operators/spatial_softmax_with_loss_op.cc",
"caffe2/operators/sqr_op.cc",
"caffe2/operators/sqrt_op.cc",
"caffe2/operators/square_root_divide_op.cc",
"caffe2/operators/stats_ops.cc",
"caffe2/operators/stats_put_ops.cc",
"caffe2/operators/stop_gradient.cc",
"caffe2/operators/string_ops.cc",
"caffe2/operators/stump_func_op.cc",
"caffe2/operators/stylizer_ops.cc",
"caffe2/operators/summarize_op.cc",
"caffe2/operators/swish_op.cc",
"caffe2/operators/tan_op.cc",
"caffe2/operators/tanh_gradient_op.cc",
"caffe2/operators/tanh_op.cc",
"caffe2/operators/tensor_protos_db_input.cc",
"caffe2/operators/text_file_reader.cc",
"caffe2/operators/text_file_reader_utils.cc",
"caffe2/operators/thresholded_relu_op.cc",
"caffe2/operators/tile_op.cc",
"caffe2/operators/top_k.cc",
"caffe2/operators/transpose_op.cc",
"caffe2/operators/tt_linear_op.cc",
"caffe2/operators/unique_ops.cc",
"caffe2/operators/upsample_op.cc",
"caffe2/operators/utility_ops.cc",
"caffe2/operators/variable_length_sequence_padding.cc",
"caffe2/operators/weighted_multi_sampling_op.cc",
"caffe2/operators/weighted_sample_op.cc",
"caffe2/operators/while_op.cc",
"caffe2/operators/workspace_ops.cc",
"caffe2/operators/zero_gradient_op.cc",
],
)
filegroup(
name = "caffe2_opt_srcs",
srcs = [
"caffe2/opt/annotations.cc",
"caffe2/opt/backend_cutting.cc",
"caffe2/opt/backend_transformer_base.cc",
"caffe2/opt/bound_shape_inferencer.cc",
"caffe2/opt/converter.cc",
"caffe2/opt/dead_code_elim.cc",
"caffe2/opt/device.cc",
"caffe2/opt/distributed.cc",
"caffe2/opt/distributed_converter.cc",
"caffe2/opt/fusion.cc",
"caffe2/opt/mobile.cc",
"caffe2/opt/onnxifi_op.cc",
"caffe2/opt/onnxifi_transformer.cc",
"caffe2/opt/optimize_ideep.cc",
"caffe2/opt/optimizer.cc",
"caffe2/opt/passes.cc",
"caffe2/opt/shape_info.cc",
"caffe2/opt/tvm_transformer.cc",
],
)
filegroup(
name = "caffe2_perfkernels_srcs",
srcs = [
"caffe2/perfkernels/adagrad.cc",
"caffe2/perfkernels/embedding_lookup.cc",
"caffe2/perfkernels/embedding_lookup_idx.cc",
"caffe2/perfkernels/fused_8bit_rowwise_embedding_lookup.cc",
"caffe2/perfkernels/fused_8bit_rowwise_embedding_lookup_idx.cc",
"caffe2/perfkernels/fused_nbit_rowwise_conversion.cc",
"caffe2/perfkernels/lstm_unit_cpu_common.cc",
"caffe2/perfkernels/math_cpu_base.cc",
"caffe2/perfkernels/typed_axpy.cc",
],
)
filegroup(
name = "caffe2_predictor_srcs",
srcs = [
"caffe2/predictor/emulator/data_filler.cc",
"caffe2/predictor/emulator/data_filler.h",
"caffe2/predictor/predictor.cc",
"caffe2/predictor/predictor_config.cc",
"caffe2/predictor/predictor_utils.cc",
],
)
filegroup(
name = "caffe2_quantization_srcs",
srcs = [
"caffe2/quantization/server/activation_distribution_observer.cc",
"caffe2/quantization/server/batch_matmul_dnnlowp_op.cc",
"caffe2/quantization/server/caffe2_dnnlowp_utils.cc",
"caffe2/quantization/server/channel_shuffle_dnnlowp_op.cc",
"caffe2/quantization/server/concat_dnnlowp_op.cc",
"caffe2/quantization/server/conv_dnnlowp_acc16_op.cc",
"caffe2/quantization/server/conv_dnnlowp_op.cc",
"caffe2/quantization/server/conv_relu_op.cc",
"caffe2/quantization/server/dequantize_dnnlowp_op.cc",
"caffe2/quantization/server/dnnlowp.cc",
"caffe2/quantization/server/dnnlowp_partition.cc",
"caffe2/quantization/server/dynamic_histogram.cc",
"caffe2/quantization/server/elementwise_add_dnnlowp_op.cc",
"caffe2/quantization/server/elementwise_linear_dnnlowp_op.cc",
"caffe2/quantization/server/elementwise_mul_dnnlowp_op.cc",
"caffe2/quantization/server/elementwise_sum_dnnlowp_op.cc",
"caffe2/quantization/server/elementwise_sum_relu_op.cc",
"caffe2/quantization/server/fbgemm_pack_matrix_cache.cc",
"caffe2/quantization/server/fbgemm_pack_op.cc",
"caffe2/quantization/server/fully_connected_dnnlowp_acc16_op.cc",
"caffe2/quantization/server/fully_connected_dnnlowp_op.cc",
"caffe2/quantization/server/fully_connected_fake_lowp_op.cc",
"caffe2/quantization/server/group_norm_dnnlowp_op.cc",
"caffe2/quantization/server/int8_gen_quant_params.cc",
"caffe2/quantization/server/kl_minimization.cc",
"caffe2/quantization/server/lstm_unit_dnnlowp_op.cc",
"caffe2/quantization/server/norm_minimization.cc",
"caffe2/quantization/server/p99.cc",
"caffe2/quantization/server/pool_dnnlowp_op.cc",
"caffe2/quantization/server/quantize_dnnlowp_op.cc",
"caffe2/quantization/server/relu_dnnlowp_op.cc",
"caffe2/quantization/server/sigmoid.cc",
"caffe2/quantization/server/sigmoid_dnnlowp_op.cc",
"caffe2/quantization/server/spatial_batch_norm_dnnlowp_op.cc",
"caffe2/quantization/server/tanh.cc",
"caffe2/quantization/server/tanh_dnnlowp_op.cc",
"caffe2/quantization/server/utility_dnnlowp_ops.cc",
],
)
filegroup(
name = "caffe2_queue_srcs",
srcs = [
"caffe2/queue/blobs_queue.cc",
"caffe2/queue/blobs_queue_db.cc",
"caffe2/queue/queue_ops.cc",
"caffe2/queue/rebatching_queue.cc",
"caffe2/queue/rebatching_queue_ops.cc",
],
)
filegroup(
name = "caffe2_serialize_srcs",
srcs = [
"caffe2/serialize/file_adapter.cc",
"caffe2/serialize/inline_container.cc",
"caffe2/serialize/istream_adapter.cc",
"caffe2/serialize/read_adapter_interface.cc",
],
)
filegroup(
name = "caffe2_sgd_srcs",
srcs = [
"caffe2/sgd/adadelta_op.cc",
"caffe2/sgd/adagrad_op.cc",
"caffe2/sgd/adam_op.cc",
"caffe2/sgd/clip_tensor_op.cc",
"caffe2/sgd/ftrl_op.cc",
"caffe2/sgd/gftrl_op.cc",
"caffe2/sgd/iter_op.cc",
"caffe2/sgd/lars_op.cc",
"caffe2/sgd/learning_rate_adaption_op.cc",
"caffe2/sgd/learning_rate_op.cc",
"caffe2/sgd/momentum_sgd_op.cc",
"caffe2/sgd/rmsprop_op.cc",
"caffe2/sgd/wngrad_op.cc",
"caffe2/sgd/yellowfin_op.cc",
],
)
filegroup(
name = "caffe2_transforms_srcs",
srcs = [
"caffe2/transforms/common_subexpression_elimination.cc",
"caffe2/transforms/conv_to_nnpack_transform.cc",
"caffe2/transforms/pattern_net_transform.cc",
"caffe2/transforms/single_op_transform.cc",
],
)
filegroup(
name = "caffe2_utils_srcs",
srcs = [
"caffe2/utils/bench_utils.cc",
"caffe2/utils/cpuid.cc",
"caffe2/utils/math/broadcast.cc",
"caffe2/utils/math/elementwise.cc",
"caffe2/utils/math/reduce.cc",
"caffe2/utils/math/transpose.cc",
"caffe2/utils/math/utils.cc",
"caffe2/utils/math_cpu.cc",
"caffe2/utils/murmur_hash3.cc",
"caffe2/utils/proto_convert.cc",
"caffe2/utils/proto_utils.cc",
"caffe2/utils/proto_wrap.cc",
"caffe2/utils/signal_handler.cc",
"caffe2/utils/smart_tensor_printer.cc",
"caffe2/utils/string_utils.cc",
"caffe2/utils/threadpool/ThreadPool.cc",
"caffe2/utils/threadpool/pthreadpool.cc",
"caffe2/utils/threadpool/pthreadpool_impl.cc",
"caffe2/utils/threadpool/thread_pool_guard.cpp",
],
)
filegroup(
name = "caffe2_cuda_srcs",
srcs = [
"caffe2/contrib/aten/aten_op_gpu.cc",
"caffe2/contrib/gloo/allreduce_ops_gpu.cc",
"caffe2/contrib/gloo/broadcast_ops_gpu.cc",
"caffe2/contrib/gloo/common_world_ops_gpu.cc",
"caffe2/core/blob_serialization_gpu.cc",
"caffe2/core/common_cudnn.cc",
"caffe2/core/common_gpu.cc",
"caffe2/core/event_gpu.cc",
"caffe2/db/create_db_op_gpu.cc",
"caffe2/distributed/file_store_handler_op_gpu.cc",
"caffe2/operators/communicator_op_gpu.cc",
"caffe2/operators/concat_split_op_gpu.cc",
"caffe2/operators/conv_op_cache_cudnn.cc",
"caffe2/operators/conv_op_cudnn.cc",
"caffe2/operators/conv_op_gpu.cc",
"caffe2/operators/conv_op_shared_gpu.cc",
"caffe2/operators/conv_transpose_op_cudnn.cc",
"caffe2/operators/conv_transpose_op_gpu.cc",
"caffe2/operators/counter_ops_gpu.cc",
"caffe2/operators/do_op_gpu.cc",
"caffe2/operators/dropout_op_cudnn.cc",
"caffe2/operators/elementwise_add_op_gpu.cc",
"caffe2/operators/elementwise_sub_op_gpu.cc",
"caffe2/operators/elu_op_cudnn.cc",
"caffe2/operators/exp_op_gpu.cc",
"caffe2/operators/expand_op_gpu.cc",
"caffe2/operators/expand_squeeze_dims_op_gpu.cc",
"caffe2/operators/free_op_gpu.cc",
"caffe2/operators/fully_connected_op_gpu.cc",
"caffe2/operators/if_op_gpu.cc",
"caffe2/operators/im2col_op_gpu.cc",
"caffe2/operators/load_save_op_gpu.cc",
"caffe2/operators/local_response_normalization_op_cudnn.cc",
"caffe2/operators/locally_connected_op_gpu.cc",
"caffe2/operators/log_op_gpu.cc",
"caffe2/operators/matmul_op_gpu.cc",
"caffe2/operators/negate_gradient_op_gpu.cc",
"caffe2/operators/negative_op_gpu.cc",
"caffe2/operators/order_switch_ops_cudnn.cc",
"caffe2/operators/order_switch_ops_gpu.cc",
"caffe2/operators/pool_op_cudnn.cc",
"caffe2/operators/prepend_dim_op_gpu.cc",
"caffe2/operators/reshape_op_gpu.cc",
"caffe2/operators/rnn/recurrent_network_blob_fetcher_op_gpu.cc",
"caffe2/operators/rnn/recurrent_network_executor_gpu.cc",
"caffe2/operators/rnn/recurrent_op_cudnn.cc",
"caffe2/operators/scale_op_gpu.cc",
"caffe2/operators/shape_op_gpu.cc",
"caffe2/operators/sigmoid_op_cudnn.cc",
"caffe2/operators/softmax_op_cudnn.cc",
"caffe2/operators/sqr_op_gpu.cc",
"caffe2/operators/sqrt_op_gpu.cc",
"caffe2/operators/stop_gradient_gpu.cc",
"caffe2/operators/tanh_op_cudnn.cc",
"caffe2/operators/tensor_protos_db_input_gpu.cc",
"caffe2/operators/transpose_op_cudnn.cc",
"caffe2/operators/while_op_gpu.cc",
"caffe2/operators/zero_gradient_op_gpu.cc",
"caffe2/queue/queue_ops_gpu.cc",
"caffe2/sgd/iter_op_gpu.cc",
"caffe2/sgd/learning_rate_op_gpu.cc",
],
)
filegroup(
name = "caffe2_cu_srcs",
srcs = [
"caffe2/core/context_gpu.cu.cc",
"caffe2/operators/abs_op.cu.cc",
"caffe2/operators/accumulate_op.cu.cc",
"caffe2/operators/accuracy_op.cu.cc",
"caffe2/operators/acos_op.cu.cc",
"caffe2/operators/affine_channel_op.cu.cc",
"caffe2/operators/alias_with_name.cu.cc",
"caffe2/operators/arg_ops.cu.cc",
"caffe2/operators/asin_op.cu.cc",
"caffe2/operators/assert_op.cu.cc",
"caffe2/operators/atan_op.cu.cc",
"caffe2/operators/batch_gather_ops.cu.cc",
"caffe2/operators/batch_matmul_op.cu.cc",
"caffe2/operators/batch_moments_op.cu.cc",
"caffe2/operators/batch_permutation_op.cu.cc",
"caffe2/operators/batch_sparse_to_dense_op.cu.cc",
"caffe2/operators/boolean_mask_ops.cu.cc",
"caffe2/operators/boolean_unmask_ops.cu.cc",
"caffe2/operators/bucketize_op.cu.cc",
"caffe2/operators/cast_op.cu.cc",
"caffe2/operators/cbrt_op.cu.cc",
"caffe2/operators/ceil_op.cu.cc",
"caffe2/operators/channel_backprop_stats_op.cu.cc",
"caffe2/operators/channel_shuffle_op.cu.cc",
"caffe2/operators/channel_stats_op.cu.cc",
"caffe2/operators/channelwise_conv3d_op_cudnn.cu.cc",
"caffe2/operators/clip_op.cu.cc",
"caffe2/operators/copy_op.cu.cc",
"caffe2/operators/cos_op.cu.cc",
"caffe2/operators/cosh_op.cu.cc",
"caffe2/operators/cosine_embedding_criterion_op.cu.cc",
"caffe2/operators/cross_entropy_op.cu.cc",
"caffe2/operators/cube_op.cu.cc",
"caffe2/operators/data_couple_gpu.cu.cc",
"caffe2/operators/deform_conv_op.cu.cc",
"caffe2/operators/depthwise_3x3_conv_op_cudnn.cu.cc",
"caffe2/operators/distance_op.cu.cc",
"caffe2/operators/dropout_op.cu.cc",
"caffe2/operators/elementwise_div_op.cu.cc",
"caffe2/operators/elementwise_linear_op.cu.cc",
"caffe2/operators/elementwise_mul_op.cu.cc",
"caffe2/operators/elementwise_ops.cu.cc",
"caffe2/operators/elu_op.cu.cc",
"caffe2/operators/enforce_finite_op.cu.cc",
"caffe2/operators/ensure_cpu_output_op.cu.cc",
"caffe2/operators/erf_op.cu.cc",
"caffe2/operators/filler_op.cu.cc",
"caffe2/operators/find_op.cu.cc",
"caffe2/operators/floor_op.cu.cc",
"caffe2/operators/gather_op.cu.cc",
"caffe2/operators/gelu_op.cu.cc",
"caffe2/operators/generate_proposals_op.cu.cc",
"caffe2/operators/generate_proposals_op_util_nms_gpu.cu.cc",
"caffe2/operators/given_tensor_byte_string_to_uint8_fill_op.cu.cc",
"caffe2/operators/given_tensor_fill_op.cu.cc",
"caffe2/operators/glu_op.cu.cc",
"caffe2/operators/group_norm_op.cu.cc",
"caffe2/operators/gru_unit_op_gpu.cu.cc",
"caffe2/operators/half_float_ops.cu.cc",
"caffe2/operators/hard_sigmoid_op.cu.cc",
"caffe2/operators/instance_norm_op.cu.cc",
"caffe2/operators/integral_image_op.cu.cc",
"caffe2/operators/layer_norm_op.cu.cc",
"caffe2/operators/leaky_relu_op.cu.cc",
"caffe2/operators/lengths_pad_op.cu.cc",
"caffe2/operators/lengths_tile_op.cu.cc",
"caffe2/operators/local_response_normalization_op.cu.cc",
"caffe2/operators/logit_op.cu.cc",
"caffe2/operators/loss_op.cu.cc",
"caffe2/operators/lp_pool_op.cu.cc",
"caffe2/operators/lstm_unit_op_gpu.cu.cc",
"caffe2/operators/margin_ranking_criterion_op.cu.cc",
"caffe2/operators/max_pool_with_index.cu.cc",
"caffe2/operators/mean_op.cu.cc",
"caffe2/operators/mem_query_op.cu.cc",
"caffe2/operators/minmax_ops.cu.cc",
"caffe2/operators/moments_op.cu.cc",
"caffe2/operators/multi_class_accuracy_op.cu.cc",
"caffe2/operators/normalize_ops.cu.cc",
"caffe2/operators/one_hot_ops.cu.cc",
"caffe2/operators/pack_segments.cu.cc",
"caffe2/operators/pad_op_gpu.cu.cc",
"caffe2/operators/perplexity_op.cu.cc",
"caffe2/operators/piecewise_linear_transform_op.cu.cc",
"caffe2/operators/pool_op.cu.cc",
"caffe2/operators/pow_op.cu.cc",
"caffe2/operators/prelu_op.cu.cc",
"caffe2/operators/reciprocal_op.cu.cc",
"caffe2/operators/reduce_front_back_max_ops.cu.cc",
"caffe2/operators/reduce_front_back_sum_mean_ops.cu.cc",
"caffe2/operators/reduce_ops.cu.cc",
"caffe2/operators/reduction_ops.cu.cc",
"caffe2/operators/relu_n_op.cu.cc",
"caffe2/operators/relu_op.cu.cc",
"caffe2/operators/replace_nan_op.cu.cc",
"caffe2/operators/resize_3d_op.cu.cc",
"caffe2/operators/resize_op.cu.cc",
"caffe2/operators/reverse_packed_segs_op.cu.cc",
"caffe2/operators/rmac_regions_op.cu.cc",
"caffe2/operators/rnn/recurrent_network_op_gpu.cu.cc",
"caffe2/operators/roi_align_gradient_op.cu.cc",
"caffe2/operators/roi_align_op.cu.cc",
"caffe2/operators/roi_align_rotated_gradient_op.cu.cc",
"caffe2/operators/roi_align_rotated_op.cu.cc",
"caffe2/operators/roi_pool_op.cu.cc",
"caffe2/operators/rsqrt_op.cu.cc",
"caffe2/operators/scale_blobs_op.cu.cc",
"caffe2/operators/segment_reduction_op_gpu.cu.cc",
"caffe2/operators/selu_op.cu.cc",
"caffe2/operators/sequence_ops.cu.cc",
"caffe2/operators/sigmoid_op.cu.cc",
"caffe2/operators/sin_op.cu.cc",
"caffe2/operators/sinh_op.cu.cc",
"caffe2/operators/slice_op.cu.cc",
"caffe2/operators/softmax_ops.cu.cc",
"caffe2/operators/softplus_op.cu.cc",
"caffe2/operators/softsign_op.cu.cc",
"caffe2/operators/space_batch_op_gpu.cu.cc",
"caffe2/operators/sparse_normalize_op_gpu.cu.cc",
"caffe2/operators/sparse_to_dense_op.cu.cc",
"caffe2/operators/spatial_batch_norm_op.cu.cc",
"caffe2/operators/spatial_batch_norm_op_cudnn.cu.cc",
"caffe2/operators/stump_func_op.cu.cc",
"caffe2/operators/summarize_op.cu.cc",
"caffe2/operators/swish_op.cu.cc",
"caffe2/operators/tan_op.cu.cc",
"caffe2/operators/tanh_op.cu.cc",
"caffe2/operators/thresholded_relu_op.cu.cc",
"caffe2/operators/tile_op.cu.cc",
"caffe2/operators/top_k.cu.cc",
"caffe2/operators/transpose_op.cu.cc",
"caffe2/operators/unique_ops.cu.cc",
"caffe2/operators/upsample_op.cu.cc",
"caffe2/operators/utility_ops.cu.cc",
"caffe2/operators/weighted_sample_op.cu.cc",
"caffe2/sgd/adadelta_op_gpu.cu.cc",
"caffe2/sgd/adagrad_op_gpu.cu.cc",
"caffe2/sgd/adam_op_gpu.cu.cc",
"caffe2/sgd/fp16_momentum_sgd_op.cu.cc",
"caffe2/sgd/fp32_momentum_sgd_op.cu.cc",
"caffe2/sgd/lars_op_gpu.cu.cc",
"caffe2/sgd/momentum_sgd_op_gpu.cu.cc",
"caffe2/sgd/rmsprop_op_gpu.cu.cc",
"caffe2/sgd/yellowfin_op_gpu.cu.cc",
"caffe2/utils/math/broadcast.cu.cc",
"caffe2/utils/math/elementwise.cu.cc",
"caffe2/utils/math/reduce.cu.cc",
"caffe2/utils/math/transpose.cu.cc",
"caffe2/utils/math_gpu.cu.cc",
],
)
# To achieve finer granularity and make debugging easier, caffe2 is split into
# three libraries: ATen, caffe2, and caffe2_for_aten_headers. The ATen library
# groups the sources under the `aten/` directory, and caffe2 contains most files
# under the `caffe2/` directory. Since the ATen and caffe2 libraries would
# otherwise depend on each other, `caffe2_for_aten_headers` is split out of
# `caffe2` to break the dependency cycle; see the illustrative sketch after the
# rule below.
cc_library(
name = "caffe2_for_aten_headers",
hdrs = [
"caffe2/core/common.h",
"caffe2/core/logging.h",
"caffe2/core/types.h",
"caffe2/perfkernels/common.h",
"caffe2/perfkernels/embedding_lookup.h",
"caffe2/perfkernels/embedding_lookup_idx.h",
"caffe2/utils/fixed_divisor.h",
"caffe2/utils/cpuid.h",
] + glob([
"caffe2/utils/threadpool/*.h",
"caffe2/proto/*.h",
]),
copts = CAFFE2_COPTS,
visibility = ["//visibility:public"],
deps = [
":c10_headers",
":caffe2_protos",
":caffe2_core_macros_h",
],
)
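# A minimal sketch of the cycle-breaking pattern used above, with hypothetical
# targets "a" and "b" standing in for aten and caffe2: both compiled libraries
# can depend on the header-only target, so "a" never needs to depend on "b"
# itself:
#
#   cc_library(name = "b_headers", hdrs = glob(["b/**/*.h"]))
#   cc_library(name = "a", srcs = glob(["a/**/*.cc"]), deps = [":b_headers"])
#   cc_library(name = "b", srcs = glob(["b/**/*.cc"]), deps = [":a", ":b_headers"])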
cc_library(
name = "caffe2_headers",
hdrs = glob([
"caffe2/contrib/aten/*.h",
"caffe2/contrib/gloo/*.h",
"caffe2/core/*.h",
"caffe2/core/nomnigraph/include/nomnigraph/Converters/*.h",
"caffe2/core/nomnigraph/include/nomnigraph/Generated/*.h",
"caffe2/core/nomnigraph/include/nomnigraph/Graph/*.h",
"caffe2/core/nomnigraph/include/nomnigraph/Representations/*.h",
"caffe2/core/nomnigraph/include/nomnigraph/Support/*.h",
"caffe2/core/nomnigraph/include/nomnigraph/Transformations/*.h",
"caffe2/core/nomnigraph/tests/*.h",
"caffe2/db/*.h",
"caffe2/distributed/*.h",
"caffe2/ideep/*.h",
"caffe2/ideep/operators/*.h",
"caffe2/ideep/operators/quantization/*.h",
"caffe2/ideep/utils/*.h",
"caffe2/onnx/*.h",
"caffe2/operators/*.h",
"caffe2/operators/rnn/*.h",
"caffe2/opt/*.h",
"caffe2/perfkernels/*.h",
"caffe2/predictor/*.h",
"caffe2/predictor/emulator/*.h",
"caffe2/proto/*.h",
"caffe2/quantization/server/*.h",
"caffe2/queue/*.h",
"caffe2/serialize/*.h",
"caffe2/sgd/*.h",
"caffe2/share/contrib/depthwise/*.h",
"caffe2/transforms/*.h",
"caffe2/utils/*.h",
"caffe2/utils/math/*.h",
"caffe2/utils/threadpool/*.h",
"modules/**/*.h",
]) + if_cuda(glob([
"caffe2/**/*.cuh",
"caffe2/image/*.h",
])),
copts = CAFFE2_COPTS,
includes = [
"caffe2/contrib/aten",
"caffe2/core/nomnigraph/include",
"third_party/miniz-2.0.8",
],
visibility = ["//visibility:public"],
deps = [
":caffe2_for_aten_headers",
":caffe2_protos",
],
)
cc_library(
name = "caffe2_dnnlowp_avx2_ops",
srcs = [
"caffe2/quantization/server/elementwise_sum_dnnlowp_op_avx2.cc",
"caffe2/quantization/server/fully_connected_fake_lowp_op_avx2.cc",
"caffe2/quantization/server/group_norm_dnnlowp_op_avx2.cc",
"caffe2/quantization/server/norm_minimization_avx2.cc",
"caffe2/quantization/server/pool_dnnlowp_op_avx2.cc",
"caffe2/quantization/server/relu_dnnlowp_op_avx2.cc",
"caffe2/quantization/server/spatial_batch_norm_dnnlowp_op_avx2.cc",
"caffe2/quantization/server/transpose.cc",
],
copts = CAFFE2_COPTS + [
"-mf16c",
"-mavx2",
"-mfma",
"-mxsave",
],
visibility = ["//visibility:public"],
deps = [
":caffe2_headers",
"@fbgemm",
],
alwayslink = True,
)
cc_library(
name = "caffe2",
srcs = [
"caffe2/db/create_db_op.cc",
"caffe2/db/protodb.cc",
"caffe2/share/contrib/depthwise/depthwise3x3_conv_op.cc",
":caffe2_contrib_srcs",
":caffe2_core_srcs",
":caffe2_distributed_srcs",
":caffe2_ideep_srcs",
":caffe2_onnx_srcs",
":caffe2_operators_srcs",
":caffe2_opt_srcs",
":caffe2_perfkernels_srcs",
":caffe2_predictor_srcs",
":caffe2_quantization_srcs",
":caffe2_queue_srcs",
":caffe2_serialize_srcs",
":caffe2_sgd_srcs",
":caffe2_transforms_srcs",
":caffe2_utils_srcs",
],
copts = CAFFE2_COPTS + ["-mf16c"],
linkstatic = 1,
visibility = ["//visibility:public"],
deps = [
":caffe2_headers",
":caffe2_dnnlowp_avx2_ops",
":caffe2_perfkernels_avx",
":caffe2_perfkernels_avx2",
":caffe2_perfkernels_avx512",
":caffe2_protos",
"//third_party/miniz-2.0.8:miniz",
"@com_google_protobuf//:protobuf",
"@eigen",
"@fbgemm//:fbgemm_src_headers",
"@foxi",
"@gloo",
"@onnx",
"@fmt",
] + if_cuda(
[
":caffe2_cpp_cuda",
":aten_cuda",
"@tensorpipe//:tensorpipe_cuda",
],
[
":aten",
"@tensorpipe//:tensorpipe_cpu",
],
),
alwayslink = True,
)
cc_library(
name = "caffe2_cpp_cuda",
srcs = [":caffe2_cuda_srcs"],
copts = CAFFE2_COPTS,
visibility = ["//visibility:public"],
deps = [
":caffe2_cuda",
":caffe2_headers",
],
alwayslink = True,
)
cu_library(
name = "caffe2_cuda",
srcs = [":caffe2_cu_srcs"],
copts = CAFFE2_COPTS + torch_cuda_half_options,
visibility = ["//visibility:public"],
deps = [
":aten",
":caffe2_headers",
"@cub",
"@cuda//:cublas",
"@cuda//:curand",
"@cudnn",
"@eigen",
"@gloo",
"@tensorpipe//:tensorpipe_cuda",
],
alwayslink = True,
)
PERF_COPTS = [
"-DHAVE_GCC_GET_CPUID",
"-DUSE_AVX",
"-DUSE_AVX2",
"-DTH_HAVE_THREAD",
"-DHAVE_AVX_CPU_DEFINITION",
"-DHAVE_AVX2_CPU_DEFINITION",
"-DENABLE_ALIAS=1",
"-DHAVE_MALLOC_USABLE_SIZE=1",
"-DHAVE_MMAP=1",
"-DHAVE_SHM_OPEN=1",
"-DHAVE_SHM_UNLINK=1",
"-DSLEEF_STATIC_LIBS=1",
"-D_FILE_OFFSET_BITS=64",
"-DUSE_FBGEMM",
"-fvisibility-inlines-hidden",
"-Wunused-parameter",
"-fno-math-errno",
"-fno-trapping-math",
"-mf16c",
]
PERF_HEADERS = glob([
"caffe2/perfkernels/*.h",
"caffe2/core/*.h",
])
cc_library(
name = "caffe2_perfkernels_avx",
srcs = glob([
"caffe2/perfkernels/*_avx.cc",
]),
hdrs = PERF_HEADERS,
copts = PERF_COPTS + [
"-mavx",
],
visibility = ["//visibility:public"],
deps = [
":caffe2_headers",
":c10",
],
alwayslink = True,
)
cc_library(
name = "caffe2_perfkernels_avx2",
srcs = glob([
"caffe2/perfkernels/*_avx2.cc",
]),
hdrs = PERF_HEADERS,
copts = PERF_COPTS + [
"-mavx2",
"-mfma",
"-mavx",
],
visibility = ["//visibility:public"],
deps = [
":caffe2_headers",
":c10",
],
alwayslink = True,
)
cc_library(
name = "caffe2_perfkernels_avx512",
srcs = [
"caffe2/perfkernels/common_avx512.cc",
],
hdrs = PERF_HEADERS,
copts = PERF_COPTS + [
"-mavx512f",
"-mavx512dq",
"-mavx512vl",
"-mavx2",
"-mfma",
"-mavx",
],
visibility = ["//visibility:public"],
deps = [
":caffe2_headers",
":c10",
],
alwayslink = True,
)
# torch
py_binary(
name = "gen_version_header",
srcs = ["tools/setup_helpers/gen_version_header.py"],
)
genrule(
name = "version_h",
srcs = ["torch/csrc/api/include/torch/version.h.in", "version.txt"],
outs = ["torch/csrc/api/include/torch/version.h"],
cmd = "$(location :gen_version_header) --template-path $(location torch/csrc/api/include/torch/version.h.in) --version-path $(location version.txt) --output-path $@",
    tools = [":gen_version_header"],
)
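# An illustrative expansion of the genrule command above (the tool and output
# paths are a sketch, not the exact ones Bazel substitutes):
#
#   gen_version_header \
#       --template-path torch/csrc/api/include/torch/version.h.in \
#       --version-path version.txt \
#       --output-path bazel-out/k8-fastbuild/bin/torch/csrc/api/include/torch/version.h
#
# The header alone can be materialized with `bazel build //:version_h`.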
torch_cuda_headers = glob(["torch/csrc/cuda/*.h"])
cc_library(
name = "torch_headers",
hdrs = if_cuda(
torch_cuda_headers,
) + glob(
[
"torch/*.h",
"torch/csrc/**/*.h",
"torch/csrc/distributed/c10d/*.hpp",
"torch/lib/libshm/*.h",
],
exclude = [
"torch/csrc/autograd/generated/VariableType.h",
"torch/csrc/autograd/generated/RegistrationDeclarations.h",
"torch/csrc/autograd/generated/variable_factories.h",
"torch/csrc/autograd/generated/Functions.h",
] + torch_cuda_headers,
) + [":cpp_generated_code", ":version_h"],
includes = [
"torch/csrc",
"torch/csrc/api/include",
"torch/csrc/distributed",
"torch/lib",
"torch/lib/libshm",
],
visibility = ["//visibility:public"],
deps = [
":aten_headers",
":c10_headers",
":caffe2_headers",
"@local_config_python//:python_headers",
"@onnx",
],
alwayslink = True,
)
TORCH_COPTS = COMMON_COPTS + [
"-Dtorch_EXPORTS",
"-DHAVE_AVX_CPU_DEFINITION",
"-DHAVE_AVX2_CPU_DEFINITION",
"-DCAFFE2_USE_GLOO",
"-fvisibility-inlines-hidden",
"-fno-math-errno ",
"-fno-trapping-math",
]
cc_library(
name = "torch",
srcs = if_cuda(glob(
[
"torch/csrc/cuda/*.cpp",
"torch/csrc/autograd/functions/comm.cpp",
],
exclude = [
"torch/csrc/cuda/python_nccl.cpp",
"torch/csrc/cuda/nccl.cpp",
],
)) + libtorch_core_sources + libtorch_distributed_sources + torch_cpp_srcs + libtorch_extra_sources + jit_core_sources + [
":cpp_generated_code",
],
copts = TORCH_COPTS + if_cuda(["-DUSE_CUDA=1"]),
defines = [
"CAFFE2_NIGHTLY_VERSION=20200115",
],
visibility = ["//visibility:public"],
deps = [
":caffe2",
":torch_headers",
],
alwayslink = True,
)
cc_library(
name = "shm",
srcs = glob(["torch/lib/libshm/*.cpp"]),
deps = [
":torch",
],
)
cc_library(
name = "libtorch_headers",
hdrs = glob([
"**/*.h",
"**/*.cuh",
]) + [
":generated_code",
],
includes = [
".",
"torch/csrc/api/include",
"torch/csrc/distributed",
"torch/lib",
"torch/lib/libshm",
],
visibility = ["//visibility:public"],
deps = [
":aten_headers",
":c10_headers",
":caffe2_headers",
],
)
cc_library(
name = "torch_python",
srcs = libtorch_python_core_sources + [":python_generated_code"],
hdrs = glob([
"torch/csrc/generic/*.cpp",
]),
deps = [
":torch",
":shm",
],
)
pybind_extension(
name = "_C",
srcs = ["torch/csrc/stub.c"],
deps = [
        ":torch_python",
],
)
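# A usage note: the resulting _C artifact is the extension module that
# `import torch._C` loads at runtime; torch/csrc/stub.c is only a thin entry
# point that hands initialization off to the torch_python sources.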
# cpp api tests
cc_library(
name = "test_support",
testonly = True,
srcs = [
"test/cpp/api/support.cpp",
],
hdrs = [
"test/cpp/api/init_baseline.h",
"test/cpp/api/optim_baseline.h",
"test/cpp/api/support.h",
"test/cpp/common/support.h",
],
deps = [
":torch",
"@com_google_googletest//:gtest_main",
],
)
# Torch integration tests rely on a labeled data set from the MNIST database.
# http://yann.lecun.com/exdb/mnist/
cpp_api_tests = glob(["test/cpp/api/*.cpp"])
[
cc_test(
        name = paths.split_extension(paths.basename(filename))[0].replace("-", "_") + "_test",
size = "medium",
srcs = [filename],
deps = [
":test_support",
"@com_google_googletest//:gtest_main",
],
) for filename in cpp_api_tests
]
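# A worked example of the name derivation above, for a hypothetical source
# file test/cpp/api/nn-utils.cpp:
#
#   paths.basename("test/cpp/api/nn-utils.cpp")   # -> "nn-utils.cpp"
#   paths.split_extension("nn-utils.cpp")[0]      # -> "nn-utils"
#   "nn-utils".replace("-", "_") + "_test"        # -> "nn_utils_test"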
test_suite(
name = "api_tests",
tests = [
"any_test",
"autograd_test",
"dataloader_test",
"enum_test",
"expanding_array_test",
"functional_test",
"init_test",
"integration_test",
"jit_test",
"memory_test",
"misc_test",
"module_test",
"modulelist_test",
"modules_test",
"nn_utils_test",
"optim_test",
"ordered_dict_test",
"rnn_test",
"sequential_test",
"serialize_test",
"static_test",
"tensor_options_test",
"tensor_test",
"torch_include_test",
],
)
# dist autograd tests
cc_test(
name = "torch_dist_autograd_test",
size = "small",
srcs = ["test/cpp/dist_autograd/test_dist_autograd.cpp"],
tags = [
"exclusive",
"gpu-required",
],
deps = [
":torch",
"@com_google_googletest//:gtest_main",
],
)
# jit tests
# Because these individual unit tests require custom registration,
# it is easier to mimic the cmake build by globbing them together into a single test.
cc_test(
name = "jit_tests",
size = "small",
srcs = glob([
"test/cpp/jit/*.cpp",
"test/cpp/jit/*.h",
"test/cpp/tensorexpr/*.cpp",
"test/cpp/tensorexpr/*.h",
]),
linkstatic = True,
tags = [
"exclusive",
"gpu-required",
],
deps = [
":torch",
"@com_google_googletest//:gtest_main",
],
)
cc_test(
name = "lazy_tests",
size = "small",
srcs = glob([
"test/cpp/lazy/*.cpp",
"test/cpp/lazy/*.h",
]),
linkstatic = True,
tags = [
"exclusive",
],
deps = [
":torch",
"@com_google_googletest//:gtest_main",
],
)
# all tests
test_suite(
name = "all_tests",
tests = [
"api_tests",
"c10_tests",
"jit_tests",
"torch_dist_autograd_test",
],
)
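# A typical way to run everything registered here, assuming this BUILD file
# sits at the workspace root:
#
#   bazel test //:all_tests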