2020-05-19 05:00:55 +00:00
|
|
|
load("@bazel_skylib//lib:paths.bzl", "paths")
|
2020-05-20 05:49:28 +00:00
|
|
|
load("@pybind11_bazel//:build_defs.bzl", "pybind_extension")
|
2020-04-07 05:48:33 +00:00
|
|
|
load("@rules_proto//proto:defs.bzl", "proto_library")
|
|
|
|
|
load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library", "cc_proto_library", "cc_test")
|
2020-08-27 19:09:34 +00:00
|
|
|
load("//third_party:substitution.bzl", "header_template_rule")
|
2020-06-28 04:58:27 +00:00
|
|
|
load("//:tools/build_variables.bzl", "torch_cpp_srcs", "libtorch_python_core_sources", "libtorch_core_sources", "libtorch_distributed_sources", "libtorch_extra_sources", "jit_core_sources")
|
2020-04-07 05:48:33 +00:00
|
|
|
load("//tools/rules:cu.bzl", "cu_library")
|
|
|
|
|
load("//tools/config:defs.bzl", "if_cuda")
|
|
|
|
|
load("//:aten.bzl", "intern_build_aten_ops")
|
|
|
|
|
|
|
|
|
|
# Compiler flags shared by the C/C++ targets in this file.
# The HAVE_* feature macros are hardcoded for this Bazel build rather
# than probed at configure time (CMake does the probing in the other
# build), and the CUDA-specific defines are appended only when CUDA is
# enabled via the if_cuda select (loaded from //tools/config:defs.bzl).
COMMON_COPTS = [
    "-DHAVE_MALLOC_USABLE_SIZE=1",
    "-DHAVE_MMAP=1",
    "-DHAVE_SHM_OPEN=1",
    "-DHAVE_SHM_UNLINK=1",
    "-D_FILE_OFFSET_BITS=64",
    "-DHAVE_GCC_GET_CPUID",
    "-DUSE_GCC_GET_CPUID",
    "-DTH_HAVE_THREAD",
    "-DUSE_FBGEMM",
    "-DUSE_DISTRIBUTED",
    "-DATEN_THREADING=NATIVE",
    "-DNO_CUDNN_DESTROY_HANDLE",
] + if_cuda([
    "-DUSE_CUDA",
    "-DUSE_CUDNN",
])
|
|
|
|
|
|
|
|
|
|
# c10

# Generates c10/macros/cmake_macros.h from its CMake template by plain
# textual substitution: every "cmakedefine" becomes "define", and
# C10_USE_NUMA is then explicitly switched back off for this build.
# NOTE(review): substitution order matters here — "cmakedefine" must be
# rewritten first so the C10_USE_NUMA line matches "#define C10_USE_NUMA".
header_template_rule(
    name = "cmake_macros_h",
    src = "c10/macros/cmake_macros.h.in",
    out = "c10/macros/cmake_macros.h",
    substitutions = {
        "cmakedefine": "define",
        "#define C10_USE_NUMA": "/* #undef C10_USE_NUMA */",
    },
)
|
|
|
|
|
|
2020-08-27 19:09:34 +00:00
|
|
|
# Generates c10/cuda/impl/cuda_cmake_macros.h from its CMake template;
# all "cmakedefine" markers are turned into plain defines (i.e. every
# optional CUDA macro in the template is enabled).
header_template_rule(
    name = "cuda_cmake_macros_h",
    src = "c10/cuda/impl/cuda_cmake_macros.h.in",
    out = "c10/cuda/impl/cuda_cmake_macros.h",
    substitutions = {
        "cmakedefine": "define",
    },
)
|
|
|
|
|
|
|
|
|
|
# Header-only view of c10. The CUDA headers are globbed unconditionally
# (only the .cpp sources are gated on if_cuda, in :c10 below), and the
# two generated cmake_macros headers are pulled in via deps.
cc_library(
    name = "c10_headers",
    hdrs = glob([
        "c10/core/*.h",
        "c10/core/impl/*.h",
        "c10/cuda/*.h",
        "c10/cuda/impl/*.h",
        "c10/macros/*.h",
        "c10/mobile/*.h",
        "c10/util/*.h",
        "c10/util/*.hpp",
    ]),
    deps = [
        ":cmake_macros_h",
        ":cuda_cmake_macros_h",
        "@com_github_gflags_gflags//:gflags",
        "@com_github_glog//:glog",
    ],
)
|
|
|
|
|
|
|
|
|
|
# The compiled c10 library. CPU sources are always built; the CUDA
# sources under c10/cuda are only compiled when CUDA is enabled
# (if_cuda(glob(...), []) selects an empty list otherwise).
# alwayslink keeps static-registration objects from being dropped by
# the linker.
cc_library(
    name = "c10",
    srcs = glob([
        "c10/core/*.cpp",
        "c10/core/impl/*.cpp",
        "c10/mobile/*.cpp",
        "c10/util/*.cpp",
    ]) + if_cuda(
        glob([
            "c10/cuda/*.cpp",
            "c10/cuda/impl/*.cpp",
        ]),
        [],
    ),
    copts = ["-DCAFFE2_BUILD_MAIN_LIB"],
    deps = [
        ":c10_headers",
        "@fmt",
    ] + if_cuda(
        ["@cuda"],
        [],
    ),
    alwayslink = True,
)
|
|
|
|
|
|
|
|
|
|
# gtest suite for c10 (core + util tests; driven by gtest_main, so no
# test needs its own main()).
cc_test(
    name = "c10_tests",
    size = "small",
    srcs = glob([
        "c10/test/core/*.cpp",
        "c10/test/core/impl/*.cpp",
        "c10/test/util/*.cpp",
        "c10/test/util/*.h",
    ]),
    copts = ["-Wno-deprecated-declarations"],
    deps = [
        ":c10",
        ":c10_headers",
        "@com_google_googletest//:gtest_main",
    ],
)
|
|
|
|
|
|
2020-08-31 15:58:32 +00:00
|
|
|
# TODO: refactor this into its own library (but how to make
# a binary based off of a module in a library?)
py_binary(
    name = "gen",
    srcs = ["tools/setup_helpers/gen.py"],
    deps = [
        ":tools_codegen",
    ],
)
|
|
|
|
|
|
|
|
|
|
genrule(
|
|
|
|
|
name = "generated_cpp",
|
|
|
|
|
srcs = [
|
|
|
|
|
"aten/src/ATen/native/native_functions.yaml",
|
|
|
|
|
] + glob(["aten/src/ATen/templates/**"]),
|
|
|
|
|
outs = [
|
|
|
|
|
"aten/src/ATen/Declarations.yaml",
|
2020-11-12 17:51:21 +00:00
|
|
|
"aten/src/ATen/RegisterBackendSelect.cpp",
|
|
|
|
|
"aten/src/ATen/RegisterCPU.cpp",
|
Add a Functionalization pass in core (#64432)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64432
Original PR description + feedback here: https://github.com/pytorch/pytorch/pull/63048
I've addressed all of the feedback in the original PR and made some pretty large changes, listed below.
**Table of Contents**
- Starting points
- List of the main changes from the original PR
- Next Steps
- Example codegen output (for a view, mutation, and view+mutation op)
**Starting Points**
A good place to start when looking through the PR:
* Alban mentioned that this is a useful mental model (thanks Ed for originally making this clear to me). Semantically, the pass currently does THREE things, which are all needed by functorch - all fused together into one big pass.
* (a) alias removal, which replaces {view} calls with {view}_copy calls, and manually tracks aliasing information, so that when one tensor is mutated, we re-apply the same mutation to all of the aliases. This is the bulk of the work - once this is done, the next 2 things are trivial to implement.
* (b) mutation removal, which is easy to do once we know that there are no aliases. Every mutation `a.add_(b)` becomes `a.replace_(a.add(b))`
* (c) reapplying views: all of the `{view}_copy` calls are replaced with `{view}` calls again. This is an optimization that we can make specifically for functorch (and strided backends), that only care about mutation removal and not alias removal
* XLA and Vulkan only want (a), or (a) + (b). Later, we'll want to split this out so that you can actually opt into different versions of this logic.
* There is currently no {view}_copy replacement, because the pass just <replace views with copies> and <replace copies with views> steps have been combined. Later, we'll want to actually implement {view}_copy variants of each view operator, probably with codegen.
* documentation breadcrumb 1, in `FunctionalTensorWrapper.cpp`: https://github.com/pytorch/pytorch/pull/64432/files#diff-a0bac99bf205dba5b94cb64fc2466d3d55d991887572f9cd6a02e27b3a91dd60R59 (you might have to expand the `FunctionalTensorWrapper.cpp` file, which GitHub closes by default because it's large)
* documentation breadcrumb 2, in `FunctionalTensorWrapper.h`: https://github.com/pytorch/pytorch/pull/64432/files#diff-c945c71a4ccac65871f24a912e8904f9a5088b24a32e636727ea9c8fe920708aR12
* Reading through the codegen output at the bottom of this description.
**Main changes from the original PR**
(1) I use lambdas instead of a giant enum to handle all of the different views.
This results in less boilerplate per view op (and more stuff that can be codegen'd). Every `ViewMeta` object now contains a `forward` and `reverse` lambda, that knows how to replay the view and its inverse. This makes the actual code that executes the replaying logic a lot less boilerplate-y (see `Alias::sync_update_operations` and `FunctionalTensorWrapper::sync_`)
(2) Every tensor during the functionalization pass is always wrapped in a `FunctionalTensorWrapper`.
This is potentially unnecessary for Vulkan/XLA, and will have a mild perf impact, but for now this PR just targets the functorch use case. I previously had a complicated design a (`FunctionalTensorImplBase` class) to avoid needing the wrapper for XLA, but it had some subtleties that are gonna require more thought to fix, so I'm pushing that off for now.
(3) `FunctionalTensorWrapper` objects accurately report stride information.
It's a little annoying to do this though, because the logic that calculates stride info for each view isn't easily separated from the actual view kernels in core, `at::native::{view}`. I do this by adding logic in each `at::functionalization::{view}` kernel to call the reference implementation `at::native::{view}`. I don't do anything with the output aside from taking it's size/stride/storage_offset to set the actual output tensor's size/stride/storage_offset correctly. There's another annoying part to this: I'm pretty sure that we want to pass in the actual *wrapper* tensors directly into the native kernels, not their inner unwrapped values. But there are some `at::native::{view}` kernels that call other tensor methods, which re-invokes the dispatcher, calling functionalization/functorch kernels that try do the unwrapping.
To do this, right now I have an `AutoDispatchDirectlyToNative` guard that basically ensures that any tensor methods called inside of the at::native::{view} op always redispatch straight to the CPU kernel (which will be another at::native:: kernel). This feels kind of heavy handed, but I'm not sure of a better way to do it.
(4) `FunctionalTensorWrapper` objects accurately report aliasing information.
There's a new `FunctionalStorageImpl` class (subclass of `StorageImpl`) that allows tensors in the functionalization pass to accurately alias storage. If two tensors `a` and `b` in a functionalized program are views of one another, then `a.storage.is_alias_of(b.storage)` should return true. I added this in a pretty similar way to how meta tensors allocate storage, although I don't pass in an actual allocator (I think this is fine because you should never resize a functional tensor's storage).
One thing I'm not sure about - should `FunctionalTensorWrapper` set `storage_access_should_throw_`: (a) always, (b) never, (c) only if its wrapped tensor has it set.
Right now I have it not set, mostly because calling the reference view functions (`at::native::{view}`) requires looking at the storage. But that means that if you try to access storage from python in a functionalized program, you'll get silent garbage instead of an error. Related question: are we planning on exposing meta tensor storage to python in the future (even though it contains garbage)?
(5) better docs :)
**View operator coverage**
(6) The functionalization pass now gets math-composite view ops for free.
I didn't add the `Functionalize` dispatch key to the composite set, because I don't want composite ops like `torch.ones` to get decomposed before hitting the functionalization pass. Instead, I added codegen to manually register the `at::native::` kernels of composite view ops. This is a little hairy, because the names of the `at::native::` kernels aren't easily accessible. They're stored in a `Dict[DispatchKey, BackendIndex]`. I made a best-effort attempt to get each view kernel's name, basically by assuming that every view op has either a composite or cpu implementation.
There's also a hardcoded list of composite view ops in `gen_inplace_or_view_type.py`, but it looks like it's wrong. This is probably worth rationalizing later, but instead I created a new list of the "complete" set of composite view ops, and preserved the old set by hardcoding the delta between the two sets.
(7) I've added codegen for ops that are both views AND mutations, like `transpose_()` (why do we even have these {emoji:1f622}).
From some light testing, it looks like they work correctly with one caveat: I had a hard time ensuring that functorch programs that mutate their inputs using ops like `transpose_()` preserve the input mutations after the program finishes running. For (in my corresponding functorch branch) I emit a warning when this happens, and just don't preserve the mutation
(8) I added `{view}_inverse` implementations for every view op, in `FunctionalInverses.cpp`.
These are needed to take mutations made to views and replay them back onto the base. To reduce boilerplate, the codegen generates function declarations for each `{view}_inverse` function, so you get a nice compiler error when someone eventually adds a new view op.
The only view ops currently not supported are (a) as_strided, and (b) the sparse view ops (values()/indices()).
I can add support for as_strided, but it needs an `as_strided_inverse()` function. That will look really similar to the `as_strided_backward()` function in FunctionsManual.cpp, but it has some noticeable differences: we basically want an `as_strided_embed` for autograd and `as_strided_scatter` for functionalization. We also will probably need them to be primitives w.r.t to autograd, since the currently implementation for autograd uses view().copy_() calls that XLA won't be able to handle. I'm wondering if anyone has any objections, but otherwise I can make those change (which will require writing backward formulas for `as_strided_embed` and `as_strided_scatter`).
I did a bunch of manual testing that all looks pretty good, but it's definitely not fully tested. Ed pointed out that once XLA uses this pass (or at least once there's a POC), we can just run the existing xla view test suite. Hopefully that delay is okay - if it's not, maybe we can think about using OpInfos similar to how functorch uses them for testing.
Note: there's some duplication with autograd's view code. Every `{view}_inverse` implementation is really similar to the implementation for that view listed in `derivatives.yaml`. There are some major differences though:
* the autograd implementations over those backwards functions (like `permute_backwards()`, in `FunctionsManual.cpp`) internally call other view ops. For functionalization, we want them to eventually call `{view}_copy` operators.
* For view ops that take a subset of the original storage, like `slice/select/diagonal/as_strided()`, the autograd backward functions fill the "spaces" in the inverse call with zeroes. For functionalizations, we want to fill them with the value of `base` at those positions. It looks like this currently applies to 6 total ops (since we can ignore composites):
* select
* slice
* diagonal
* as_strided
* split
* split_with_sizes
A nice end state would probably be for the autograd + functionalization codegen to both look at the same yaml (either `derivatives.yaml`, or something else), and automatically generate the right thing. I didn't leave that in scope for this PR though.
**Current State + Next Steps**
There are a bunch of followups after this PR eventually lands. Roughly in order:
* Use the current pass to register problematic composite ops in functorch. Also, nested `functionalize()` calls aren't supported yet (I mostly just need to remove some debug asserts and test it).
* Work on freeing up dispatch key space in the by deduplicating the `{backend}`/`Autograd{backend}`/`Sparse{backend}`/`Quantized{backend}` keys
* Once we have more dispatch keys, split up this pass into 3 pieces - it's currently fused, and doesn't do the right thing for vulkan/XLA. Specifically, all of the `{view}` calls in the current pass's view-replay logic should turn into `{view}_copy` calls that vulkan/XLA know how to implement, and there will be separate passes for (a) removing mutations, and (b) turning `{view}_copy` calls back into `{view}` calls. For Vulkan, we eventually want a pass that ONLY removes aliasing and view calls, and doesn't remove mutations. We can also probably make the 2 new passes user dispatch keys to save dispatch key space, if they'll only be used by functorch anyway.
* Do more of a dive on perf for the vulkan/xla use cases. There are several areas to improve perf with varying levels of effort required. The simplest one that I'll probably do regardless is to codegen the out-of-place kernels instead of using a boxed fallback. Getting a POC working for xla will also be useful to test the view operator coverage.
**Example Codegen Output**
View Op:
```
::std::vector<at::Tensor> split_Tensor(c10::DispatchKeySet ks, const at::Tensor & self, int64_t split_size, int64_t dim) {
auto self_ = at::functionalization::impl::unwrapFunctionalTensor(self);
::std::vector<at::Tensor> out;
{
at::AutoDispatchBelowFunctionalize guard;
auto tmp_output = at::redispatch::split(ks & c10::after_func_keyset, self_, split_size, dim);
out = at::functionalization::impl::wrapFunctionalTensor(tmp_output);
// I'm fusing the [alias removal], [mutation removal], [add views back] passes together.
// Later, we'll want to turn them into separate passes (since e.g. vulkan only cares about alias removal).
}
at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta(
[split_size, dim](const at::Tensor& base, int64_t mutated_view_idx) -> at::Tensor {
return base.split(split_size, dim)[mutated_view_idx];
},
[split_size, dim](const at::Tensor& base, const at::Tensor& mutated_view, int64_t mutated_view_idx) -> at::Tensor {
return at::functionalization::impl::split_inverse(base, mutated_view, mutated_view_idx, split_size, dim);
}
);
at::functionalization::impl::set_view_meta(out, self, view_meta);
at::AutoDispatchDirectlyToNative native_guard;
::std::vector<at::Tensor> reference_tensor_output = at::native::split(self, split_size, dim);
at::functionalization::impl::set_strides(out, reference_tensor_output);
return out;
}
```
Mutation Op:
```
at::Tensor & add__Tensor(c10::DispatchKeySet ks, at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha) {
at::functionalization::impl::sync(self);
at::functionalization::impl::sync(other);
auto self_ = at::functionalization::impl::unwrapFunctionalTensor(self);
auto other_ = at::functionalization::impl::unwrapFunctionalTensor(other);
at::Tensor tmp_output;
{
at::AutoDispatchBelowFunctionalize guard;
// The functionalization pass explicitly doesn't pass out= parameters to the redispatch
tmp_output = at::redispatch::add(
ks & c10::after_func_keyset, self_, other_, alpha);
}
self.replace_(tmp_output);
at::functionalization::impl::maybe_add_update(self);
return self;
}
```
View + Mutation Op:
```
at::Tensor & transpose_(c10::DispatchKeySet ks, at::Tensor & self, int64_t dim0, int64_t dim1) {
at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta(
[dim0, dim1](const at::Tensor& base, int64_t mutated_view_idx) -> at::Tensor {
return base.transpose(dim0, dim1);
},
[dim0, dim1](const at::Tensor& base, const at::Tensor& mutated_view, int64_t mutated_view_idx) -> at::Tensor {
return at::functionalization::impl::transpose_inverse(base, mutated_view, dim0, dim1);
}
);
at::functionalization::impl::mutate_view_meta(self, view_meta);
// See Note [Propagating strides in the functionalization pass]
// Directly update the sizes/strides/storage_offset fields on self using the inplace call.
// I need the guard because I don't want the at::native kernel to end up calling more functionalization/functorch kernels.
// Its only job is to directly compute the output size/stride/storage_offset metadata.
at::AutoDispatchDirectlyToNative native_guard;
at::native::transpose_(self, dim0, dim1);
return self;
}
```
Test Plan: Imported from OSS
Reviewed By: albanD
Differential Revision: D31942093
Pulled By: bdhirsh
fbshipit-source-id: b95598dae35dd1842fa8b1d8d1448332f3afaadf
2021-10-28 17:43:11 +00:00
|
|
|
"aten/src/ATen/RegisterFunctionalization_0.cpp",
|
|
|
|
|
"aten/src/ATen/RegisterFunctionalization_1.cpp",
|
|
|
|
|
"aten/src/ATen/RegisterFunctionalization_2.cpp",
|
|
|
|
|
"aten/src/ATen/RegisterFunctionalization_3.cpp",
|
|
|
|
|
# "aten/src/ATen/RegisterFunctionalizationEverything.cpp",
|
2020-11-12 17:51:21 +00:00
|
|
|
"aten/src/ATen/RegisterMkldnnCPU.cpp",
|
|
|
|
|
"aten/src/ATen/RegisterQuantizedCPU.cpp",
|
|
|
|
|
"aten/src/ATen/RegisterSparseCPU.cpp",
|
2021-04-12 17:07:56 +00:00
|
|
|
"aten/src/ATen/RegisterSparseCsrCPU.cpp",
|
2021-12-08 18:34:08 +00:00
|
|
|
"aten/src/ATen/RegisterZeroTensor.cpp",
|
2021-03-24 20:47:50 +00:00
|
|
|
"aten/src/ATen/RegisterCompositeImplicitAutograd.cpp",
|
2021-03-26 17:50:06 +00:00
|
|
|
"aten/src/ATen/RegisterCompositeExplicitAutograd.cpp",
|
bugfix: ensure that at::{dispatch_key}:: API gets external linkage (#58569)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58569
This should allow external C++ files that aren't compiled into `libtorch.so`/`libtorch_cpu.so` (including all of fbcode) to use fast path functions like `at::cpu::add()`, which skip the dispatcher.
So, after spending way too much time trying to figure out why I was getting linker errors when calling `at::meta::{op}` and `at::cpu::{op}` from C++ test files, I realized that we're not including the header files for C++ for the namespaced operator definitions. I.e. `RegisterCPU.cpp`, which provides definitions for the `at::cpu::{op}` fast path functions, wasn't including the `CPUFunctions.h` header.
Why that breaks stuff: the `CPUFunctions.h` header file is what marks each function with the `TORCH_API` macro, so without including it, when we build `libtorch.so` and `libtorch_cpu.so`, the compiler will look at the definition in `RegisterCPU.cpp`, not see a `TORCH_API`, and decide that the function should get internal linkage.
An alternative would be to directly mark the function definitions in `RegisterCPU.cpp` with `TORCH_API`, but this seemed cleaner.
Test Plan: Imported from OSS
Reviewed By: pbelevich
Differential Revision: D28711300
Pulled By: bdhirsh
fbshipit-source-id: 535f245c20e977ff566d6da0757b3cefa137040b
2021-06-15 23:51:52 +00:00
|
|
|
"aten/src/ATen/RegisterMeta.cpp",
|
2020-11-12 17:51:21 +00:00
|
|
|
"aten/src/ATen/RegisterSchema.cpp",
|
Add at::cpu namespace of functions for structured kernels (#49505)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/49505
I have a problem which is that static runtime needs a way to bypass
dispatch and call into kernels directly. Previously, it used
native:: bindings to do this; but these bindings no longer exist
for structured kernels! Enter at::cpu: a namespace of exactly
at:: compatible functions that assume all of their arguments are
CPU and non-autograd! The header looks like this:
```
namespace at {
namespace cpu {
CAFFE2_API Tensor & add_out(Tensor & out, const Tensor & self, const Tensor & other, Scalar alpha=1);
CAFFE2_API Tensor add(const Tensor & self, const Tensor & other, Scalar alpha=1);
CAFFE2_API Tensor & add_(Tensor & self, const Tensor & other, Scalar alpha=1);
CAFFE2_API Tensor & upsample_nearest1d_out(Tensor & out, const Tensor & self, IntArrayRef output_size, c10::optional<double> scales=c10::nullopt);
CAFFE2_API Tensor upsample_nearest1d(const Tensor & self, IntArrayRef output_size, c10::optional<double> scales=c10::nullopt);
CAFFE2_API Tensor & upsample_nearest1d_backward_out(Tensor & grad_input, const Tensor & grad_output, IntArrayRef output_size, IntArrayRef input_size, c10::optional<double> scales=c10::nullopt);
CAFFE2_API Tensor upsample_nearest1d_backward(const Tensor & grad_output, IntArrayRef output_size, IntArrayRef input_size, c10::optional<double> scales=c10::nullopt);
}}
```
This slows down static runtime because these are not the "allow
resize of nonzero tensor" variant binding (unlike the ones I had manually
written). We can restore this: it's a matter of adding codegen smarts to
do this, but I haven't done it just yet since it's marginally more
complicated.
In principle, non-structured kernels could get this treatment too.
But, like an evil mastermind, I'm withholding it from this patch, as an extra
carrot to get people to migrate to structured muahahahaha.
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Test Plan: Imported from OSS
Reviewed By: smessmer
Differential Revision: D25616105
Pulled By: ezyang
fbshipit-source-id: 84955ae09d0b373ca1ed05e0e4e0074a18d1a0b5
2021-01-22 21:09:34 +00:00
|
|
|
"aten/src/ATen/CPUFunctions.h",
|
2021-06-25 01:07:33 +00:00
|
|
|
"aten/src/ATen/CPUFunctions_inl.h",
|
Add at::cpu namespace of functions for structured kernels (#49505)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/49505
I have a problem which is that static runtime needs a way to bypass
dispatch and call into kernels directly. Previously, it used
native:: bindings to do this; but these bindings no longer exist
for structured kernels! Enter at::cpu: a namespace of exactly
at:: compatible functions that assume all of their arguments are
CPU and non-autograd! The header looks like this:
```
namespace at {
namespace cpu {
CAFFE2_API Tensor & add_out(Tensor & out, const Tensor & self, const Tensor & other, Scalar alpha=1);
CAFFE2_API Tensor add(const Tensor & self, const Tensor & other, Scalar alpha=1);
CAFFE2_API Tensor & add_(Tensor & self, const Tensor & other, Scalar alpha=1);
CAFFE2_API Tensor & upsample_nearest1d_out(Tensor & out, const Tensor & self, IntArrayRef output_size, c10::optional<double> scales=c10::nullopt);
CAFFE2_API Tensor upsample_nearest1d(const Tensor & self, IntArrayRef output_size, c10::optional<double> scales=c10::nullopt);
CAFFE2_API Tensor & upsample_nearest1d_backward_out(Tensor & grad_input, const Tensor & grad_output, IntArrayRef output_size, IntArrayRef input_size, c10::optional<double> scales=c10::nullopt);
CAFFE2_API Tensor upsample_nearest1d_backward(const Tensor & grad_output, IntArrayRef output_size, IntArrayRef input_size, c10::optional<double> scales=c10::nullopt);
}}
```
This slows down static runtime because these are not the "allow
resize of nonzero tensor" variant binding (unlike the ones I had manually
written). We can restore this: it's a matter of adding codegen smarts to
do this, but I haven't done it just yet since it's marginally more
complicated.
In principle, non-structured kernels could get this treatment too.
But, like an evil mastermind, I'm withholding it from this patch, as an extra
carrot to get people to migrate to structured muahahahaha.
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Test Plan: Imported from OSS
Reviewed By: smessmer
Differential Revision: D25616105
Pulled By: ezyang
fbshipit-source-id: 84955ae09d0b373ca1ed05e0e4e0074a18d1a0b5
2021-01-22 21:09:34 +00:00
|
|
|
"aten/src/ATen/CUDAFunctions.h",
|
2021-06-25 01:07:33 +00:00
|
|
|
"aten/src/ATen/CUDAFunctions_inl.h",
|
bugfix: ensure that at::{dispatch_key}:: API gets external linkage (#58569)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58569
This should allow external C++ files that aren't compiled into `libtorch.so`/`libtorch_cpu.so` (including all of fbcode) to use fast path functions like `at::cpu::add()`, which skip the dispatcher.
So, after spending way too much time trying to figure out why I was getting linker errors when calling `at::meta::{op}` and `at::cpu::{op}` from C++ test files, I realized that we're not including the header files for C++ for the namespaced operator definitions. I.e. `RegisterCPU.cpp`, which provides definitions for the `at::cpu::{op}` fast path functions, wasn't including the `CPUFunctions.h` header.
Why that breaks stuff: the `CPUFunctions.h` header file is what marks each function with the `TORCH_API` macro, so without including it, when we build `libtorch.so` and `libtorch_cpu.so`, the compiler will look at the definition in `RegisterCPU.cpp`, not see a `TORCH_API`, and decide that the function should get internal linkage.
An alternative would be to directly mark the function definitions in `RegisterCPU.cpp` with `TORCH_API`, but this seemed cleaner.
Test Plan: Imported from OSS
Reviewed By: pbelevich
Differential Revision: D28711300
Pulled By: bdhirsh
fbshipit-source-id: 535f245c20e977ff566d6da0757b3cefa137040b
2021-06-15 23:51:52 +00:00
|
|
|
"aten/src/ATen/CompositeExplicitAutogradFunctions.h",
|
2021-06-25 01:07:33 +00:00
|
|
|
"aten/src/ATen/CompositeExplicitAutogradFunctions_inl.h",
|
bugfix: ensure that at::{dispatch_key}:: API gets external linkage (#58569)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/58569
This should allow external C++ files that aren't compiled into `libtorch.so`/`libtorch_cpu.so` (including all of fbcode) to use fast path functions like `at::cpu::add()`, which skip the dispatcher.
So, after spending way too much time trying to figure out why I was getting linker errors when calling `at::meta::{op}` and `at::cpu::{op}` from C++ test files, I realized that we're not including the header files for C++ for the namespaced operator definitions. I.e. `RegisterCPU.cpp`, which provides definitions for the `at::cpu::{op}` fast path functions, wasn't including the `CPUFunctions.h` header.
Why that breaks stuff: the `CPUFunctions.h` header file is what marks each function with the `TORCH_API` macro, so without including it, when we build `libtorch.so` and `libtorch_cpu.so`, the compiler will look at the definition in `RegisterCPU.cpp`, not see a `TORCH_API`, and decide that the function should get internal linkage.
An alternative would be to directly mark the function definitions in `RegisterCPU.cpp` with `TORCH_API`, but this seemed cleaner.
Test Plan: Imported from OSS
Reviewed By: pbelevich
Differential Revision: D28711300
Pulled By: bdhirsh
fbshipit-source-id: 535f245c20e977ff566d6da0757b3cefa137040b
2021-06-15 23:51:52 +00:00
|
|
|
"aten/src/ATen/CompositeImplicitAutogradFunctions.h",
|
2021-06-25 01:07:33 +00:00
|
|
|
"aten/src/ATen/CompositeImplicitAutogradFunctions_inl.h",
|
Add a Functionalization pass in core (#64432)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64432
Original PR description + feedback here: https://github.com/pytorch/pytorch/pull/63048
I've addressed all of the feedback in the original PR and made some pretty large changes, listed below.
**Table of Contents**
- Starting points
- List of the main changes from the original PR
- Next Steps
- Example codegen output (for a view, mutation, and view+mutation op)
**Starting Points**
A good place to start when looking through the PR:
* Alban mentioned that this is a useful mental model (thanks Ed for originally making this clear to me). Semantically, the pass currently does THREE things, which are all needed by functorch - all fused together into one big pass.
* (a) alias removal, which replaces {view} calls with {view}_copy calls, and manually tracks aliasing information, so that when one tensor is mutated, we re-apply the same mutation to all of the aliases. This is the bulk of the work - once this is done, the next 2 things are trivial to implement.
* (b) mutation removal, which is easy to do once we know that there are no aliases. Every mutation `a.add_(b)` becomes `a.replace_(a.add(b))`
* (c) reapplying views: all of the `{view}_copy` calls are replaced with `{view}` calls again. This is an optimization that we can make specifically for functorch (and strided backends), that only care about mutation removal and not alias removal
* XLA and Vulkan only want (a), or (a) + (b). Later, we'll want to split this out so that you can actually opt into different versions of this logic.
* There is currently no {view}_copy replacement, because the pass just <replace views with copies> and <replace copies with views> steps have been combined. Later, we'll want to actually implement {view}_copy variants of each view operator, probably with codegen.
* documentation breadcrumb 1, in `FunctionalTensorWrapper.cpp`: https://github.com/pytorch/pytorch/pull/64432/files#diff-a0bac99bf205dba5b94cb64fc2466d3d55d991887572f9cd6a02e27b3a91dd60R59 (you might have to expand the `FunctionalTensorWrapper.cpp` file, which GitHub closes by default because it's large)
* documentation breadcrumb 2, in `FunctionalTensorWrapper.h`: https://github.com/pytorch/pytorch/pull/64432/files#diff-c945c71a4ccac65871f24a912e8904f9a5088b24a32e636727ea9c8fe920708aR12
* Reading through the codegen output at the bottom of this description.
**Main changes from the original PR**
(1) I use lambdas instead of a giant enum to handle all of the different views.
This results in less boilerplate per view op (and more stuff that can be codegen'd). Every `ViewMeta` object now contains a `forward` and `reverse` lambda, that knows how to replay the view and its inverse. This makes the actual code that executes the replaying logic a lot less boilerplate-y (see `Alias::sync_update_operations` and `FunctionalTensorWrapper::sync_`)
(2) Every tensor during the functionalization pass is always wrapped in a `FunctionalTensorWrapper`.
This is potentially unnecessary for Vulkan/XLA, and will have a mild perf impact, but for now this PR just targets the functorch use case. I previously had a complicated design (a `FunctionalTensorImplBase` class) to avoid needing the wrapper for XLA, but it had some subtleties that are gonna require more thought to fix, so I'm pushing that off for now.
(3) `FunctionalTensorWrapper` objects accurately report stride information.
It's a little annoying to do this though, because the logic that calculates stride info for each view isn't easily separated from the actual view kernels in core, `at::native::{view}`. I do this by adding logic in each `at::functionalization::{view}` kernel to call the reference implementation `at::native::{view}`. I don't do anything with the output aside from taking its size/stride/storage_offset to set the actual output tensor's size/stride/storage_offset correctly. There's another annoying part to this: I'm pretty sure that we want to pass in the actual *wrapper* tensors directly into the native kernels, not their inner unwrapped values. But there are some `at::native::{view}` kernels that call other tensor methods, which re-invokes the dispatcher, calling functionalization/functorch kernels that try to do the unwrapping.
To do this, right now I have an `AutoDispatchDirectlyToNative` guard that basically ensures that any tensor methods called inside of the at::native::{view} op always redispatch straight to the CPU kernel (which will be another at::native:: kernel). This feels kind of heavy handed, but I'm not sure of a better way to do it.
(4) `FunctionalTensorWrapper` objects accurately report aliasing information.
There's a new `FunctionalStorageImpl` class (subclass of `StorageImpl`) that allows tensors in the functionalization pass to accurately alias storage. If two tensors `a` and `b` in a functionalized program are views of one another, then `a.storage.is_alias_of(b.storage)` should return true. I added this in a pretty similar way to how meta tensors allocate storage, although I don't pass in an actual allocator (I think this is fine because you should never resize a functional tensor's storage).
One thing I'm not sure about - should `FunctionalTensorWrapper` set `storage_access_should_throw_`: (a) always, (b) never, (c) only if its wrapped tensor has it set.
Right now I have it not set, mostly because calling the reference view functions (`at::native::{view}`) requires looking at the storage. But that means that if you try to access storage from python in a functionalized program, you'll get silent garbage instead of an error. Related question: are we planning on exposing meta tensor storage to python in the future (even though it contains garbage)?
(5) better docs :)
**View operator coverage**
(6) The functionalization pass now gets math-composite view ops for free.
I didn't add the `Functionalize` dispatch key to the composite set, because I don't want composite ops like `torch.ones` to get decomposed before hitting the functionalization pass. Instead, I added codegen to manually register the `at::native::` kernels of composite view ops. This is a little hairy, because the names of the `at::native::` kernels aren't easily accessible. They're stored in a `Dict[DispatchKey, BackendIndex]`. I made a best-effort attempt to get each view kernel's name, basically by assuming that every view op has either a composite or cpu implementation.
There's also a hardcoded list of composite view ops in `gen_inplace_or_view_type.py`, but it looks like it's wrong. This is probably worth rationalizing later, but instead I created a new list of the "complete" set of composite view ops, and preserved the old set by hardcoding the delta between the two sets.
(7) I've added codegen for ops that are both views AND mutations, like `transpose_()` (why do we even have these {emoji:1f622}).
From some light testing, it looks like they work correctly with one caveat: I had a hard time ensuring that functorch programs that mutate their inputs using ops like `transpose_()` preserve the input mutations after the program finishes running. For now (in my corresponding functorch branch) I emit a warning when this happens, and just don't preserve the mutation.
(8) I added `{view}_inverse` implementations for every view op, in `FunctionalInverses.cpp`.
These are needed to take mutations made to views and replay them back onto the base. To reduce boilerplate, the codegen generates function declarations for each `{view}_inverse` function, so you get a nice compiler error when someone eventually adds a new view op.
The only view ops currently not supported are (a) as_strided, and (b) the sparse view ops (values()/indices()).
I can add support for as_strided, but it needs an `as_strided_inverse()` function. That will look really similar to the `as_strided_backward()` function in FunctionsManual.cpp, but it has some noticeable differences: we basically want an `as_strided_embed` for autograd and `as_strided_scatter` for functionalization. We also will probably need them to be primitives w.r.t to autograd, since the current implementation for autograd uses view().copy_() calls that XLA won't be able to handle. I'm wondering if anyone has any objections, but otherwise I can make those changes (which will require writing backward formulas for `as_strided_embed` and `as_strided_scatter`).
I did a bunch of manual testing that all looks pretty good, but it's definitely not fully tested. Ed pointed out that once XLA uses this pass (or at least once there's a POC), we can just run the existing xla view test suite. Hopefully that delay is okay - if it's not, maybe we can think about using OpInfos similar to how functorch uses them for testing.
Note: there's some duplication with autograd's view code. Every `{view}_inverse` implementation is really similar to the implementation for that view listed in `derivatives.yaml`. There are some major differences though:
* the autograd implementations of those backwards functions (like `permute_backwards()`, in `FunctionsManual.cpp`) internally call other view ops. For functionalization, we want them to (eventually) call `{view}_copy` operators.
* For view ops that take a subset of the original storage, like `slice/select/diagonal/as_strided()`, the autograd backward functions fill the "spaces" in the inverse call with zeroes. For functionalizations, we want to fill them with the value of `base` at those positions. It looks like this currently applies to 6 total ops (since we can ignore composites):
* select
* slice
* diagonal
* as_strided
* split
* split_with_sizes
A nice end state would probably be for the autograd + functionalization codegen to both look at the same yaml (either `derivatives.yaml`, or something else), and automatically generate the right thing. I didn't leave that in scope for this PR though.
**Current State + Next Steps**
There are a bunch of followups after this PR eventually lands. Roughly in order:
* Use the current pass to register problematic composite ops in functorch. Also, nested `functionalize()` calls aren't supported yet (I mostly just need to remove some debug asserts and test it).
* Work on freeing up dispatch key space by deduplicating the `{backend}`/`Autograd{backend}`/`Sparse{backend}`/`Quantized{backend}` keys
* Once we have more dispatch keys, split up this pass into 3 pieces - it's currently fused, and doesn't do the right thing for vulkan/XLA. Specifically, all of the `{view}` calls in the current pass's view-replay logic should turn into `{view}_copy` calls that vulkan/XLA know how to implement, and there will be separate passes for (a) removing mutations, and (b) turning `{view}_copy` calls back into `{view}` calls. For Vulkan, we eventually want a pass that ONLY removes aliasing and view calls, and doesn't remove mutations. We can also probably make the 2 new passes use user dispatch keys to save dispatch key space, if they'll only be used by functorch anyway.
* Do more of a dive on perf for the vulkan/xla use cases. There are several areas to improve perf with varying levels of effort required. The simplest one that I'll probably do regardless is to codegen the out-of-place kernels instead of using a boxed fallback. Getting a POC working for xla will also be useful to test the view operator coverage.
**Example Codegen Output**
View Op:
```
::std::vector<at::Tensor> split_Tensor(c10::DispatchKeySet ks, const at::Tensor & self, int64_t split_size, int64_t dim) {
auto self_ = at::functionalization::impl::unwrapFunctionalTensor(self);
::std::vector<at::Tensor> out;
{
at::AutoDispatchBelowFunctionalize guard;
auto tmp_output = at::redispatch::split(ks & c10::after_func_keyset, self_, split_size, dim);
out = at::functionalization::impl::wrapFunctionalTensor(tmp_output);
// I'm fusing the [alias removal], [mutation removal], [add views back] passes together.
// Later, we'll want to turn them into separate passes (since e.g. vulkan only cares about alias removal).
}
at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta(
[split_size, dim](const at::Tensor& base, int64_t mutated_view_idx) -> at::Tensor {
return base.split(split_size, dim)[mutated_view_idx];
},
[split_size, dim](const at::Tensor& base, const at::Tensor& mutated_view, int64_t mutated_view_idx) -> at::Tensor {
return at::functionalization::impl::split_inverse(base, mutated_view, mutated_view_idx, split_size, dim);
}
);
at::functionalization::impl::set_view_meta(out, self, view_meta);
at::AutoDispatchDirectlyToNative native_guard;
::std::vector<at::Tensor> reference_tensor_output = at::native::split(self, split_size, dim);
at::functionalization::impl::set_strides(out, reference_tensor_output);
return out;
}
```
Mutation Op:
```
at::Tensor & add__Tensor(c10::DispatchKeySet ks, at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha) {
at::functionalization::impl::sync(self);
at::functionalization::impl::sync(other);
auto self_ = at::functionalization::impl::unwrapFunctionalTensor(self);
auto other_ = at::functionalization::impl::unwrapFunctionalTensor(other);
at::Tensor tmp_output;
{
at::AutoDispatchBelowFunctionalize guard;
// The functionalization pass explicitly doesn't pass out= parameters to the redispatch
tmp_output = at::redispatch::add(
ks & c10::after_func_keyset, self_, other_, alpha);
}
self.replace_(tmp_output);
at::functionalization::impl::maybe_add_update(self);
return self;
}
```
View + Mutation Op:
```
at::Tensor & transpose_(c10::DispatchKeySet ks, at::Tensor & self, int64_t dim0, int64_t dim1) {
at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta(
[dim0, dim1](const at::Tensor& base, int64_t mutated_view_idx) -> at::Tensor {
return base.transpose(dim0, dim1);
},
[dim0, dim1](const at::Tensor& base, const at::Tensor& mutated_view, int64_t mutated_view_idx) -> at::Tensor {
return at::functionalization::impl::transpose_inverse(base, mutated_view, dim0, dim1);
}
);
at::functionalization::impl::mutate_view_meta(self, view_meta);
// See Note [Propagating strides in the functionalization pass]
// Directly update the sizes/strides/storage_offset fields on self using the inplace call.
// I need the guard because I don't want the at::native kernel to end up calling more functionalization/functorch kernels.
// Its only job is to directly compute the output size/stride/storage_offset metadata.
at::AutoDispatchDirectlyToNative native_guard;
at::native::transpose_(self, dim0, dim1);
return self;
}
```
Test Plan: Imported from OSS
Reviewed By: albanD
Differential Revision: D31942093
Pulled By: bdhirsh
fbshipit-source-id: b95598dae35dd1842fa8b1d8d1448332f3afaadf
2021-10-28 17:43:11 +00:00
|
|
|
"aten/src/ATen/FunctionalInverses.h",
|
2020-04-07 05:48:33 +00:00
|
|
|
"aten/src/ATen/Functions.h",
|
2021-06-17 21:26:51 +00:00
|
|
|
"aten/src/ATen/Functions.cpp",
|
2021-02-22 18:48:34 +00:00
|
|
|
"aten/src/ATen/RedispatchFunctions.h",
|
2021-06-02 00:18:02 +00:00
|
|
|
"aten/src/ATen/Operators.h",
|
2021-08-09 23:15:54 +00:00
|
|
|
"aten/src/ATen/Operators_0.cpp",
|
|
|
|
|
"aten/src/ATen/Operators_1.cpp",
|
|
|
|
|
"aten/src/ATen/Operators_2.cpp",
|
|
|
|
|
"aten/src/ATen/Operators_3.cpp",
|
|
|
|
|
"aten/src/ATen/Operators_4.cpp",
|
2020-04-07 05:48:33 +00:00
|
|
|
"aten/src/ATen/NativeFunctions.h",
|
Structured kernel definitions (#45277)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/45277
Implements structured kernels as per https://github.com/pytorch/rfcs/pull/9 and ports upsample_nearest1d to use the framework.
The general structure of this diff:
- Define a new syntax for specifying structured kernels in `native_functions.yaml`. You put `structured: True` on the `out` function (that's what you implement) and `structured_delegate: foo.out` on the functional/inplace variants to define them in terms of the `out` function. There's a bunch of new consistency checking to see if you've done this right, though the error messages are of varying quality. This is most of what's going on in tools.codegen.model
- NativeFunctionGroup turns into StructuredNativeFunctions. Previously I thought that maybe we would use this grouping mechanism for both structured and unstructured kernels, but it turned out that Jiakai needed to make his own grouping structure. So now I've specialized it for structured kernels, which also means I get to add a bunch of invariants, like requiring structured kernels to have both a functional and an out variant. This is the lower bundle of changes in tools.codegen.model
- When you make an out kernel structured, this induces us to generate a new meta function signature for you to write shape checking and output allocation code. The signatures of these is defined by `tools.codegen.api.meta` and generated into `MetaFunctions.h`. Coverage here is very bare bones and will be driven by actual operators we port as we go.
- The meaty part of code generation is what we do when we have some grouped StructuredNativeFunctions. We continue to generate a wrapper per function type, but they're a bit different, as they call your meta functions and make reference to the actual implementations in out.
- Then there's a port of `upsample_nearest1d`; easiest to review by just looking at what the final code looks like.
Missing pieces:
- Stride calculation in TensorMeta
- Sufficient sanity checking for inplace/out variants
- Enough rope to make TensorIterator work
This PR improves instruction counts on `upsample_nearest1d` because it eliminates an extra redispatch. Testing `at::upsample_nearest1d(x, {10});`
* Functional: before 1314105, after 1150705
* Out: before 915705, after 838405
These numbers may be jittered up to +-16400 (which is the difference when I tested against an unaffected operator `at::upsample_linear1d`), though that may also be because unrelated changes affected all operators globally.
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Differential Revision: D24253555
Test Plan: Imported from OSS
Reviewed By: smessmer
Pulled By: ezyang
fbshipit-source-id: 4ef58dd911991060f13576864c8171f9cc614456
2020-11-17 23:23:03 +00:00
|
|
|
"aten/src/ATen/MetaFunctions.h",
|
2021-06-25 01:07:33 +00:00
|
|
|
"aten/src/ATen/MetaFunctions_inl.h",
|
2021-06-15 23:51:52 +00:00
|
|
|
"aten/src/ATen/NativeMetaFunctions.h",
|
2020-04-07 05:48:33 +00:00
|
|
|
"aten/src/ATen/core/TensorBody.h",
|
2021-06-17 21:26:51 +00:00
|
|
|
"aten/src/ATen/core/TensorMethods.cpp",
|
2020-04-21 20:27:59 +00:00
|
|
|
"aten/src/ATen/core/ATenOpList.cpp",
|
2020-04-07 05:48:33 +00:00
|
|
|
],
|
2020-08-31 15:58:32 +00:00
|
|
|
cmd = "$(location :gen) --source-path aten/src/ATen --install_dir `dirname $(location aten/src/ATen/Declarations.yaml)`",
|
2020-04-07 05:48:33 +00:00
|
|
|
tools = [":gen"],
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
py_library(
|
2020-08-31 15:58:32 +00:00
|
|
|
name = "tools_codegen",
|
|
|
|
|
srcs = glob(["tools/codegen/**/*.py"]),
|
2020-04-07 05:48:33 +00:00
|
|
|
)
|
|
|
|
|
|
|
|
|
|
py_library(
|
|
|
|
|
name = "tools_autograd",
|
|
|
|
|
srcs = glob(["tools/autograd/*.py"]),
|
|
|
|
|
data = glob([
|
|
|
|
|
"tools/autograd/*.yaml",
|
|
|
|
|
"tools/autograd/templates/*",
|
|
|
|
|
]),
|
2020-08-31 15:58:32 +00:00
|
|
|
deps = [":tools_codegen"],
|
2020-04-07 05:48:33 +00:00
|
|
|
)
|
|
|
|
|
|
|
|
|
|
py_library(
|
|
|
|
|
name = "tools_jit",
|
|
|
|
|
srcs = glob(["tools/jit/*.py"]),
|
|
|
|
|
data = glob(["tools/jit/templates/*"]),
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
py_binary(
|
|
|
|
|
name = "generate_code",
|
|
|
|
|
srcs = ["tools/setup_helpers/generate_code.py"],
|
|
|
|
|
deps = [
|
|
|
|
|
":tools_autograd",
|
|
|
|
|
":tools_jit",
|
|
|
|
|
],
|
|
|
|
|
)
|
|
|
|
|
|
2020-05-20 05:49:28 +00:00
|
|
|
libtorch_cpp_generated_sources = [
|
2020-04-07 05:48:33 +00:00
|
|
|
"torch/csrc/autograd/generated/VariableType.h",
|
|
|
|
|
"torch/csrc/autograd/generated/VariableType_0.cpp",
|
|
|
|
|
"torch/csrc/autograd/generated/VariableType_1.cpp",
|
|
|
|
|
"torch/csrc/autograd/generated/VariableType_2.cpp",
|
|
|
|
|
"torch/csrc/autograd/generated/VariableType_3.cpp",
|
|
|
|
|
"torch/csrc/autograd/generated/VariableType_4.cpp",
|
|
|
|
|
# "torch/csrc/autograd/generated/VariableTypeEverything.cpp",
|
[pytorch] move tracing logic to a separate dispatch backend (#38467)
Summary:
This PR moves tracing logic out of the generated VariableType kernels, to associate it with a new dedicated dispatch key Tracer.
It also toggles the dispatch key set at various places to keep the semantics unchanged - see the inline [Tracing Mode Switches] note.
Sample generated code:
```
Tensor & __ilshift___Tensor(Tensor & self, const Tensor & other) {
#if !defined(PYTORCH_DISABLE_TRACING)
torch::jit::Node* node = nullptr;
std::shared_ptr<jit::tracer::TracingState> tracer_state;
if (jit::tracer::isTracing()) {
tracer_state = jit::tracer::getTracingState();
at::Symbol op_name;
op_name = jit::Symbol::fromQualString("aten::__ilshift__");
node = tracer_state->graph->create(op_name, /*num_outputs=*/0);
jit::tracer::recordSourceLocation(node);
jit::tracer::addInputs(node, "self", self);
jit::tracer::addInputs(node, "other", other);
tracer_state->graph->insertNode(node);
jit::tracer::setTracingState(nullptr);
}
#endif
static auto op = c10::Dispatcher::singleton().findSchemaOrThrow("aten::__ilshift__", "Tensor");
c10::Dispatcher::singleton().redispatch<Tensor &, Tensor &, const Tensor &>(op, c10::DispatchKey::Tracer, self, other);
#if !defined(PYTORCH_DISABLE_TRACING)
if (tracer_state) {
jit::tracer::setTracingState(std::move(tracer_state));
jit::tracer::addOutput(node, self);
}
#endif
return self;
}
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/38467
ghstack-source-id: 105215150
Test Plan: CI
Differential Revision: D21570684
fbshipit-source-id: 1a96761830307f9a934f38bfb9fe8b5b1763e0e0
2020-06-04 08:49:27 +00:00
|
|
|
"torch/csrc/autograd/generated/TraceType_0.cpp",
|
|
|
|
|
"torch/csrc/autograd/generated/TraceType_1.cpp",
|
|
|
|
|
"torch/csrc/autograd/generated/TraceType_2.cpp",
|
|
|
|
|
"torch/csrc/autograd/generated/TraceType_3.cpp",
|
|
|
|
|
"torch/csrc/autograd/generated/TraceType_4.cpp",
|
|
|
|
|
# "torch/csrc/autograd/generated/TraceTypeEverything.cpp",
|
2021-05-02 05:55:12 +00:00
|
|
|
"torch/csrc/autograd/generated/ADInplaceOrViewType_0.cpp",
|
|
|
|
|
"torch/csrc/autograd/generated/ADInplaceOrViewType_1.cpp",
|
|
|
|
|
# "torch/csrc/autograd/generated/ADInplaceOrViewTypeEverything.cpp",
|
2020-04-07 05:48:33 +00:00
|
|
|
"torch/csrc/autograd/generated/Functions.h",
|
|
|
|
|
"torch/csrc/autograd/generated/Functions.cpp",
|
|
|
|
|
"torch/csrc/autograd/generated/variable_factories.h",
|
2020-05-20 05:49:28 +00:00
|
|
|
]
|
|
|
|
|
|
|
|
|
|
libtorch_python_generated_sources = [
|
|
|
|
|
"torch/csrc/autograd/generated/python_functions.h",
|
2021-08-11 15:44:08 +00:00
|
|
|
"torch/csrc/autograd/generated/python_functions_0.cpp",
|
|
|
|
|
"torch/csrc/autograd/generated/python_functions_1.cpp",
|
|
|
|
|
"torch/csrc/autograd/generated/python_functions_2.cpp",
|
|
|
|
|
"torch/csrc/autograd/generated/python_functions_3.cpp",
|
|
|
|
|
"torch/csrc/autograd/generated/python_functions_4.cpp",
|
2020-05-20 05:49:28 +00:00
|
|
|
"torch/csrc/autograd/generated/python_variable_methods.cpp",
|
2021-08-25 22:05:14 +00:00
|
|
|
"torch/csrc/autograd/generated/python_torch_functions_0.cpp",
|
|
|
|
|
"torch/csrc/autograd/generated/python_torch_functions_1.cpp",
|
|
|
|
|
"torch/csrc/autograd/generated/python_torch_functions_2.cpp",
|
2020-05-20 05:49:28 +00:00
|
|
|
"torch/csrc/autograd/generated/python_nn_functions.cpp",
|
2020-08-06 07:18:51 +00:00
|
|
|
"torch/csrc/autograd/generated/python_fft_functions.cpp",
|
2020-08-07 17:16:56 +00:00
|
|
|
"torch/csrc/autograd/generated/python_linalg_functions.cpp",
|
2021-11-20 03:45:55 +00:00
|
|
|
"torch/csrc/autograd/generated/python_sparse_functions.cpp",
|
2021-03-04 08:00:09 +00:00
|
|
|
"torch/csrc/autograd/generated/python_special_functions.cpp",
|
expose return_types in Python (#66614)
Summary:
https://github.com/facebookresearch/functorch/issues/87
TODO:
* [x] Add comments
* [x] Add test
* [x] Fix XLA
<details>
<summary>Generated python_return_types.cpp</summary>
```cpp
#include <Python.h>
#include <vector>
#include <map>
#include <string>
#include "torch/csrc/autograd/python_return_types.h"
#include "torch/csrc/utils/structseq.h"
#include "torch/csrc/Exceptions.h"
namespace {
PyTypeObject* get__det_lu_based_helper_namedtuple() {
static PyStructSequence_Field NamedTuple_fields[] = { {"det", ""}, {"lu", ""}, {"pivs", ""}, {nullptr} };
static PyTypeObject _det_lu_based_helperNamedTuple;
static bool is_initialized = false;
static PyStructSequence_Desc desc = { "torch.return_types._det_lu_based_helper", nullptr, NamedTuple_fields, 3 };
if (!is_initialized) {
PyStructSequence_InitType(&_det_lu_based_helperNamedTuple, &desc);
_det_lu_based_helperNamedTuple.tp_repr = (reprfunc)torch::utils::returned_structseq_repr;
is_initialized = true;
}
return &_det_lu_based_helperNamedTuple;
}
PyTypeObject* get__fake_quantize_per_tensor_affine_cachemask_tensor_qparams_namedtuple() {
static PyStructSequence_Field NamedTuple_fields[] = { {"output", ""}, {"mask", ""}, {nullptr} };
static PyTypeObject _fake_quantize_per_tensor_affine_cachemask_tensor_qparamsNamedTuple;
static bool is_initialized = false;
static PyStructSequence_Desc desc = { "torch.return_types._fake_quantize_per_tensor_affine_cachemask_tensor_qparams", nullptr, NamedTuple_fields, 2 };
if (!is_initialized) {
PyStructSequence_InitType(&_fake_quantize_per_tensor_affine_cachemask_tensor_qparamsNamedTuple, &desc);
_fake_quantize_per_tensor_affine_cachemask_tensor_qparamsNamedTuple.tp_repr = (reprfunc)torch::utils::returned_structseq_repr;
is_initialized = true;
}
return &_fake_quantize_per_tensor_affine_cachemask_tensor_qparamsNamedTuple;
}
PyTypeObject* get__fused_moving_avg_obs_fq_helper_namedtuple() {
static PyStructSequence_Field NamedTuple_fields[] = { {"output", ""}, {"mask", ""}, {nullptr} };
static PyTypeObject _fused_moving_avg_obs_fq_helperNamedTuple;
static bool is_initialized = false;
static PyStructSequence_Desc desc = { "torch.return_types._fused_moving_avg_obs_fq_helper", nullptr, NamedTuple_fields, 2 };
if (!is_initialized) {
PyStructSequence_InitType(&_fused_moving_avg_obs_fq_helperNamedTuple, &desc);
_fused_moving_avg_obs_fq_helperNamedTuple.tp_repr = (reprfunc)torch::utils::returned_structseq_repr;
is_initialized = true;
}
return &_fused_moving_avg_obs_fq_helperNamedTuple;
}
PyTypeObject* get__lu_with_info_namedtuple() {
static PyStructSequence_Field NamedTuple_fields[] = { {"LU", ""}, {"pivots", ""}, {"info", ""}, {nullptr} };
static PyTypeObject _lu_with_infoNamedTuple;
static bool is_initialized = false;
static PyStructSequence_Desc desc = { "torch.return_types._lu_with_info", nullptr, NamedTuple_fields, 3 };
if (!is_initialized) {
PyStructSequence_InitType(&_lu_with_infoNamedTuple, &desc);
_lu_with_infoNamedTuple.tp_repr = (reprfunc)torch::utils::returned_structseq_repr;
is_initialized = true;
}
return &_lu_with_infoNamedTuple;
}
PyTypeObject* get__unpack_dual_namedtuple() {
static PyStructSequence_Field NamedTuple_fields[] = { {"primal", ""}, {"tangent", ""}, {nullptr} };
static PyTypeObject _unpack_dualNamedTuple;
static bool is_initialized = false;
static PyStructSequence_Desc desc = { "torch.return_types._unpack_dual", nullptr, NamedTuple_fields, 2 };
if (!is_initialized) {
PyStructSequence_InitType(&_unpack_dualNamedTuple, &desc);
_unpack_dualNamedTuple.tp_repr = (reprfunc)torch::utils::returned_structseq_repr;
is_initialized = true;
}
return &_unpack_dualNamedTuple;
}
PyTypeObject* get_aminmax_namedtuple() {
static PyStructSequence_Field NamedTuple_fields[] = { {"min", ""}, {"max", ""}, {nullptr} };
static PyTypeObject aminmaxNamedTuple;
static bool is_initialized = false;
static PyStructSequence_Desc desc = { "torch.return_types.aminmax", nullptr, NamedTuple_fields, 2 };
if (!is_initialized) {
PyStructSequence_InitType(&aminmaxNamedTuple, &desc);
aminmaxNamedTuple.tp_repr = (reprfunc)torch::utils::returned_structseq_repr;
is_initialized = true;
}
return &aminmaxNamedTuple;
}
PyTypeObject* get_aminmax_out_namedtuple() {
static PyStructSequence_Field NamedTuple_fields[] = { {"min", ""}, {"max", ""}, {nullptr} };
static PyTypeObject aminmax_outNamedTuple1;
static bool is_initialized = false;
static PyStructSequence_Desc desc = { "torch.return_types.aminmax_out", nullptr, NamedTuple_fields, 2 };
if (!is_initialized) {
PyStructSequence_InitType(&aminmax_outNamedTuple1, &desc);
aminmax_outNamedTuple1.tp_repr = (reprfunc)torch::utils::returned_structseq_repr;
is_initialized = true;
}
return &aminmax_outNamedTuple1;
}
PyTypeObject* get_cummax_namedtuple() {
static PyStructSequence_Field NamedTuple_fields[] = { {"values", ""}, {"indices", ""}, {nullptr} };
static PyTypeObject cummaxNamedTuple;
static bool is_initialized = false;
static PyStructSequence_Desc desc = { "torch.return_types.cummax", nullptr, NamedTuple_fields, 2 };
if (!is_initialized) {
PyStructSequence_InitType(&cummaxNamedTuple, &desc);
cummaxNamedTuple.tp_repr = (reprfunc)torch::utils::returned_structseq_repr;
is_initialized = true;
}
return &cummaxNamedTuple;
}
PyTypeObject* get_cummax_out_namedtuple() {
static PyStructSequence_Field NamedTuple_fields[] = { {"values", ""}, {"indices", ""}, {nullptr} };
static PyTypeObject cummax_outNamedTuple1;
static bool is_initialized = false;
static PyStructSequence_Desc desc = { "torch.return_types.cummax_out", nullptr, NamedTuple_fields, 2 };
if (!is_initialized) {
PyStructSequence_InitType(&cummax_outNamedTuple1, &desc);
cummax_outNamedTuple1.tp_repr = (reprfunc)torch::utils::returned_structseq_repr;
is_initialized = true;
}
return &cummax_outNamedTuple1;
}
PyTypeObject* get_cummin_namedtuple() {
static PyStructSequence_Field NamedTuple_fields[] = { {"values", ""}, {"indices", ""}, {nullptr} };
static PyTypeObject cumminNamedTuple;
static bool is_initialized = false;
static PyStructSequence_Desc desc = { "torch.return_types.cummin", nullptr, NamedTuple_fields, 2 };
if (!is_initialized) {
PyStructSequence_InitType(&cumminNamedTuple, &desc);
cumminNamedTuple.tp_repr = (reprfunc)torch::utils::returned_structseq_repr;
is_initialized = true;
}
return &cumminNamedTuple;
}
PyTypeObject* get_cummin_out_namedtuple() {
static PyStructSequence_Field NamedTuple_fields[] = { {"values", ""}, {"indices", ""}, {nullptr} };
static PyTypeObject cummin_outNamedTuple1;
static bool is_initialized = false;
static PyStructSequence_Desc desc = { "torch.return_types.cummin_out", nullptr, NamedTuple_fields, 2 };
if (!is_initialized) {
PyStructSequence_InitType(&cummin_outNamedTuple1, &desc);
cummin_outNamedTuple1.tp_repr = (reprfunc)torch::utils::returned_structseq_repr;
is_initialized = true;
}
return &cummin_outNamedTuple1;
}
PyTypeObject* get_eig_out_namedtuple() {
static PyStructSequence_Field NamedTuple_fields[] = { {"eigenvalues", ""}, {"eigenvectors", ""}, {nullptr} };
static PyTypeObject eig_outNamedTuple;
static bool is_initialized = false;
static PyStructSequence_Desc desc = { "torch.return_types.eig_out", nullptr, NamedTuple_fields, 2 };
if (!is_initialized) {
PyStructSequence_InitType(&eig_outNamedTuple, &desc);
eig_outNamedTuple.tp_repr = (reprfunc)torch::utils::returned_structseq_repr;
is_initialized = true;
}
return &eig_outNamedTuple;
}
PyTypeObject* get_eig_namedtuple() {
static PyStructSequence_Field NamedTuple_fields[] = { {"eigenvalues", ""}, {"eigenvectors", ""}, {nullptr} };
static PyTypeObject eigNamedTuple1;
static bool is_initialized = false;
static PyStructSequence_Desc desc = { "torch.return_types.eig", nullptr, NamedTuple_fields, 2 };
if (!is_initialized) {
PyStructSequence_InitType(&eigNamedTuple1, &desc);
eigNamedTuple1.tp_repr = (reprfunc)torch::utils::returned_structseq_repr;
is_initialized = true;
}
return &eigNamedTuple1;
}
PyTypeObject* get_frexp_namedtuple() {
static PyStructSequence_Field NamedTuple_fields[] = { {"mantissa", ""}, {"exponent", ""}, {nullptr} };
static PyTypeObject frexpNamedTuple;
static bool is_initialized = false;
static PyStructSequence_Desc desc = { "torch.return_types.frexp", nullptr, NamedTuple_fields, 2 };
if (!is_initialized) {
PyStructSequence_InitType(&frexpNamedTuple, &desc);
frexpNamedTuple.tp_repr = (reprfunc)torch::utils::returned_structseq_repr;
is_initialized = true;
}
return &frexpNamedTuple;
}
PyTypeObject* get_frexp_out_namedtuple() {
static PyStructSequence_Field NamedTuple_fields[] = { {"mantissa", ""}, {"exponent", ""}, {nullptr} };
static PyTypeObject frexp_outNamedTuple1;
static bool is_initialized = false;
static PyStructSequence_Desc desc = { "torch.return_types.frexp_out", nullptr, NamedTuple_fields, 2 };
if (!is_initialized) {
PyStructSequence_InitType(&frexp_outNamedTuple1, &desc);
frexp_outNamedTuple1.tp_repr = (reprfunc)torch::utils::returned_structseq_repr;
is_initialized = true;
}
return &frexp_outNamedTuple1;
}
// ---------------------------------------------------------------------------
// Named-tuple return types (torch.return_types.*)
//
// Each getter below lazily creates and caches one PyStructSequence type
// describing the named tuple returned by the corresponding aten op.  The
// repeated boilerplate is factored into a shared helper plus a definition
// macro; every getter keeps its original name and signature.
// ---------------------------------------------------------------------------

// Initializes `type` from `desc`, installs the shared repr used by all
// torch.return_types classes, and returns &type.  Invoked exactly once per
// getter through a function-local static, so initialization is thread-safe
// under C++11 "magic statics" — unlike the previous unsynchronized
// `static bool is_initialized` flag, which raced on concurrent first calls.
static PyTypeObject* init_structseq_type(
    PyTypeObject& type,
    PyStructSequence_Desc& desc) {
  PyStructSequence_InitType(&type, &desc);
  type.tp_repr = (reprfunc)torch::utils::returned_structseq_repr;
  return &type;
}

// Defines `PyTypeObject* getter()` returning the cached structseq type named
// "torch.return_types.<qualname>" with `nfields` fields.  The variadic part
// lists the PyStructSequence_Field initializers, e.g. {"values", ""}; the
// terminating {nullptr} sentinel is appended by the macro.
#define TORCH_DEFINE_STRUCTSEQ_GETTER(getter, qualname, nfields, ...)     \
  PyTypeObject* getter() {                                                \
    static PyStructSequence_Field fields[] = {__VA_ARGS__, {nullptr}};    \
    static PyStructSequence_Desc desc = {                                 \
        "torch.return_types." qualname, nullptr, fields, nfields};        \
    static PyTypeObject type;                                             \
    static PyTypeObject* result = init_structseq_type(type, desc);        \
    return result;                                                        \
  }

TORCH_DEFINE_STRUCTSEQ_GETTER(get_geqrf_out_namedtuple, "geqrf_out", 2, {"a", ""}, {"tau", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_geqrf_namedtuple, "geqrf", 2, {"a", ""}, {"tau", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_histogram_out_namedtuple, "histogram_out", 2, {"hist", ""}, {"bin_edges", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_histogram_namedtuple, "histogram", 2, {"hist", ""}, {"bin_edges", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_kthvalue_namedtuple, "kthvalue", 2, {"values", ""}, {"indices", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_kthvalue_out_namedtuple, "kthvalue_out", 2, {"values", ""}, {"indices", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_linalg_cholesky_ex_namedtuple, "linalg_cholesky_ex", 2, {"L", ""}, {"info", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_linalg_cholesky_ex_out_namedtuple, "linalg_cholesky_ex_out", 2, {"L", ""}, {"info", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_linalg_eig_namedtuple, "linalg_eig", 2, {"eigenvalues", ""}, {"eigenvectors", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_linalg_eig_out_namedtuple, "linalg_eig_out", 2, {"eigenvalues", ""}, {"eigenvectors", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_linalg_eigh_namedtuple, "linalg_eigh", 2, {"eigenvalues", ""}, {"eigenvectors", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_linalg_eigh_out_namedtuple, "linalg_eigh_out", 2, {"eigenvalues", ""}, {"eigenvectors", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_linalg_inv_ex_namedtuple, "linalg_inv_ex", 2, {"inverse", ""}, {"info", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_linalg_inv_ex_out_namedtuple, "linalg_inv_ex_out", 2, {"inverse", ""}, {"info", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_linalg_lstsq_namedtuple, "linalg_lstsq", 4, {"solution", ""}, {"residuals", ""}, {"rank", ""}, {"singular_values", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_linalg_lstsq_out_namedtuple, "linalg_lstsq_out", 4, {"solution", ""}, {"residuals", ""}, {"rank", ""}, {"singular_values", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_linalg_qr_namedtuple, "linalg_qr", 2, {"Q", ""}, {"R", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_linalg_qr_out_namedtuple, "linalg_qr_out", 2, {"Q", ""}, {"R", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_linalg_slogdet_namedtuple, "linalg_slogdet", 2, {"sign", ""}, {"logabsdet", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_linalg_slogdet_out_namedtuple, "linalg_slogdet_out", 2, {"sign", ""}, {"logabsdet", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_linalg_svd_out_namedtuple, "linalg_svd_out", 3, {"U", ""}, {"S", ""}, {"Vh", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_linalg_svd_namedtuple, "linalg_svd", 3, {"U", ""}, {"S", ""}, {"Vh", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_lstsq_out_namedtuple, "lstsq_out", 2, {"solution", ""}, {"QR", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_lstsq_namedtuple, "lstsq", 2, {"solution", ""}, {"QR", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_lu_unpack_namedtuple, "lu_unpack", 3, {"P", ""}, {"L", ""}, {"U", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_lu_unpack_out_namedtuple, "lu_unpack_out", 3, {"P", ""}, {"L", ""}, {"U", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_max_namedtuple, "max", 2, {"values", ""}, {"indices", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_max_out_namedtuple, "max_out", 2, {"values", ""}, {"indices", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_median_namedtuple, "median", 2, {"values", ""}, {"indices", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_median_out_namedtuple, "median_out", 2, {"values", ""}, {"indices", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_min_namedtuple, "min", 2, {"values", ""}, {"indices", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_min_out_namedtuple, "min_out", 2, {"values", ""}, {"indices", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_mode_namedtuple, "mode", 2, {"values", ""}, {"indices", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_mode_out_namedtuple, "mode_out", 2, {"values", ""}, {"indices", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_nanmedian_namedtuple, "nanmedian", 2, {"values", ""}, {"indices", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_nanmedian_out_namedtuple, "nanmedian_out", 2, {"values", ""}, {"indices", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_qr_out_namedtuple, "qr_out", 2, {"Q", ""}, {"R", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_qr_namedtuple, "qr", 2, {"Q", ""}, {"R", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_slogdet_namedtuple, "slogdet", 2, {"sign", ""}, {"logabsdet", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_solve_namedtuple, "solve", 2, {"solution", ""}, {"LU", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_solve_out_namedtuple, "solve_out", 2, {"solution", ""}, {"LU", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_sort_out_namedtuple, "sort_out", 2, {"values", ""}, {"indices", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_sort_namedtuple, "sort", 2, {"values", ""}, {"indices", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_svd_out_namedtuple, "svd_out", 3, {"U", ""}, {"S", ""}, {"V", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_svd_namedtuple, "svd", 3, {"U", ""}, {"S", ""}, {"V", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_symeig_out_namedtuple, "symeig_out", 2, {"eigenvalues", ""}, {"eigenvectors", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_symeig_namedtuple, "symeig", 2, {"eigenvalues", ""}, {"eigenvectors", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_topk_out_namedtuple, "topk_out", 2, {"values", ""}, {"indices", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_topk_namedtuple, "topk", 2, {"values", ""}, {"indices", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_triangular_solve_out_namedtuple, "triangular_solve_out", 2, {"solution", ""}, {"cloned_coefficient", ""})
TORCH_DEFINE_STRUCTSEQ_GETTER(get_triangular_solve_namedtuple, "triangular_solve", 2, {"solution", ""}, {"cloned_coefficient", ""})

#undef TORCH_DEFINE_STRUCTSEQ_GETTER
}
namespace torch {
namespace autograd {
// Lazily-built registry mapping an overload name (e.g. "max", "max_out")
// to the PyStructSequence type produced by the corresponding getter above.
// Returned by reference; the map is constructed on first call and lives for
// the remainder of the process.
std::map<std::string, PyTypeObject*>& get_namedtuple_types_map() {
  // [NOTE] Non-global map
  // This map calls Python functions during its initialization.
  // If it is a global static variable and in case it is loaded
  // before Python interpreter is ready, then the calls it makes during
  // initialization will SEGFAULT.
  // To avoid this we make it function static variable so that it is
  // initialized only after the Python interpreter is ready.
  static std::map<std::string, PyTypeObject*> namedtuple_types_map = {
      {"_det_lu_based_helper", get__det_lu_based_helper_namedtuple()},
      {"_fake_quantize_per_tensor_affine_cachemask_tensor_qparams", get__fake_quantize_per_tensor_affine_cachemask_tensor_qparams_namedtuple()},
      {"_fused_moving_avg_obs_fq_helper", get__fused_moving_avg_obs_fq_helper_namedtuple()},
      {"_lu_with_info", get__lu_with_info_namedtuple()},
      {"_unpack_dual", get__unpack_dual_namedtuple()},
      {"aminmax", get_aminmax_namedtuple()},
      {"aminmax_out", get_aminmax_out_namedtuple()},
      {"cummax", get_cummax_namedtuple()},
      {"cummax_out", get_cummax_out_namedtuple()},
      {"cummin", get_cummin_namedtuple()},
      {"cummin_out", get_cummin_out_namedtuple()},
      {"eig_out", get_eig_out_namedtuple()},
      {"eig", get_eig_namedtuple()},
      {"frexp", get_frexp_namedtuple()},
      {"frexp_out", get_frexp_out_namedtuple()},
      {"geqrf_out", get_geqrf_out_namedtuple()},
      {"geqrf", get_geqrf_namedtuple()},
      {"histogram_out", get_histogram_out_namedtuple()},
      {"histogram", get_histogram_namedtuple()},
      {"kthvalue", get_kthvalue_namedtuple()},
      {"kthvalue_out", get_kthvalue_out_namedtuple()},
      {"linalg_cholesky_ex", get_linalg_cholesky_ex_namedtuple()},
      {"linalg_cholesky_ex_out", get_linalg_cholesky_ex_out_namedtuple()},
      {"linalg_eig", get_linalg_eig_namedtuple()},
      {"linalg_eig_out", get_linalg_eig_out_namedtuple()},
      {"linalg_eigh", get_linalg_eigh_namedtuple()},
      {"linalg_eigh_out", get_linalg_eigh_out_namedtuple()},
      {"linalg_inv_ex", get_linalg_inv_ex_namedtuple()},
      {"linalg_inv_ex_out", get_linalg_inv_ex_out_namedtuple()},
      {"linalg_lstsq", get_linalg_lstsq_namedtuple()},
      {"linalg_lstsq_out", get_linalg_lstsq_out_namedtuple()},
      {"linalg_qr", get_linalg_qr_namedtuple()},
      {"linalg_qr_out", get_linalg_qr_out_namedtuple()},
      {"linalg_slogdet", get_linalg_slogdet_namedtuple()},
      {"linalg_slogdet_out", get_linalg_slogdet_out_namedtuple()},
      {"linalg_svd_out", get_linalg_svd_out_namedtuple()},
      {"linalg_svd", get_linalg_svd_namedtuple()},
      {"lstsq_out", get_lstsq_out_namedtuple()},
      {"lstsq", get_lstsq_namedtuple()},
      {"lu_unpack", get_lu_unpack_namedtuple()},
      {"lu_unpack_out", get_lu_unpack_out_namedtuple()},
      {"max", get_max_namedtuple()},
      {"max_out", get_max_out_namedtuple()},
      {"median", get_median_namedtuple()},
      {"median_out", get_median_out_namedtuple()},
      {"min", get_min_namedtuple()},
      {"min_out", get_min_out_namedtuple()},
      {"mode", get_mode_namedtuple()},
      {"mode_out", get_mode_out_namedtuple()},
      {"nanmedian", get_nanmedian_namedtuple()},
      {"nanmedian_out", get_nanmedian_out_namedtuple()},
      {"qr_out", get_qr_out_namedtuple()},
      {"qr", get_qr_namedtuple()},
      {"slogdet", get_slogdet_namedtuple()},
      {"solve", get_solve_namedtuple()},
      {"solve_out", get_solve_out_namedtuple()},
      {"sort_out", get_sort_out_namedtuple()},
      {"sort", get_sort_namedtuple()},
      {"svd_out", get_svd_out_namedtuple()},
      {"svd", get_svd_namedtuple()},
      {"symeig_out", get_symeig_out_namedtuple()},
      {"symeig", get_symeig_namedtuple()},
      {"topk_out", get_topk_out_namedtuple()},
      {"topk", get_topk_namedtuple()},
      {"triangular_solve_out", get_triangular_solve_out_namedtuple()},
      {"triangular_solve", get_triangular_solve_namedtuple()},
  };
  return namedtuple_types_map;
}
// Looks up the PyStructSequence type registered for `name` (e.g. "max_out").
// Returns nullptr when no such named tuple exists.
//
// Uses find() instead of operator[]: the previous operator[] lookup
// default-inserted a {name, nullptr} entry into the shared registry on every
// miss, silently growing and mutating the map. find() keeps the lookup
// read-only while returning the same nullptr for unknown names.
PyTypeObject* get_namedtuple(std::string name) {
  static auto& namedtuple_types_map = get_namedtuple_types_map();
  const auto it = namedtuple_types_map.find(name);
  return it != namedtuple_types_map.end() ? it->second : nullptr;
}
// Creates the torch._C._return_types submodule, registers every named-tuple
// type from get_namedtuple_types_map() in it, and attaches the submodule to
// `module` under the attribute name "_return_types".  Throws python_error
// (with the Python error indicator set) on any CPython API failure.
void initReturnTypes(PyObject* module) {
  static struct PyModuleDef def = {
      PyModuleDef_HEAD_INIT, "torch._C._return_types", nullptr, -1, {}};
  PyObject* return_types_module = PyModule_Create(&def);
  if (!return_types_module) {
    throw python_error();
  }
  for (const auto& return_type_pair : get_namedtuple_types_map()) {
    // hold onto the TypeObject for the unlikely case of user
    // deleting or overriding it.
    Py_INCREF(return_type_pair.second);
    // PyModule_AddObject steals the reference on success only; on failure we
    // must drop the reference taken just above ourselves.
    if (PyModule_AddObject(
            return_types_module,
            return_type_pair.first.c_str(),
            (PyObject*)return_type_pair.second) != 0) {
      Py_DECREF((PyObject*)return_type_pair.second);
      throw python_error();
    }
  }
  // steals a reference to return_types on success
  if (PyModule_AddObject(module, "_return_types", return_types_module) != 0) {
    Py_DECREF(return_types_module);
    throw python_error();
  }
}
} // namespace autograd
} // namespace torch
```
</details>
<details>
<summary>Eg. updated call in other python_*_functions</summary>
```cpp
// linalg_cholesky_ex
static PyObject * THPVariable_linalg_cholesky_ex(PyObject* self_, PyObject* args, PyObject* kwargs)
{
HANDLE_TH_ERRORS
static PyTypeObject* NamedTuple = get_namedtuple("linalg_cholesky_ex");
static PyTypeObject* NamedTuple1 = get_namedtuple("linalg_cholesky_ex_out");
static PythonArgParser parser({
"linalg_cholesky_ex(Tensor input, *, bool upper=False, bool check_errors=False, TensorList[2] out=None)",
}, /*traceable=*/true);
ParsedArgs<4> parsed_args;
auto _r = parser.parse(nullptr, args, kwargs, parsed_args);
if(_r.has_torch_function()) {
return handle_torch_function(_r, nullptr, args, kwargs, THPLinalgVariableFunctionsModule, "torch.linalg");
}
if (_r.isNone(3)) {
// aten::linalg_cholesky_ex(Tensor self, *, bool upper=False, bool check_errors=False) -> (Tensor L, Tensor info)
auto dispatch_linalg_cholesky_ex = [](const at::Tensor & self, bool upper, bool check_errors) -> ::std::tuple<at::Tensor,at::Tensor> {
pybind11::gil_scoped_release no_gil;
return at::linalg_cholesky_ex(self, upper, check_errors);
};
return wrap(NamedTuple, dispatch_linalg_cholesky_ex(_r.tensor(0), _r.toBool(1), _r.toBool(2)));
} else {
// aten::linalg_cholesky_ex.L(Tensor self, *, bool upper=False, bool check_errors=False, Tensor(a!) L, Tensor(b!) info) -> (Tensor(a!) L, Tensor(b!) info)
auto out = _r.tensorlist_n<2>(3);
auto dispatch_linalg_cholesky_ex_out = [](at::Tensor & L, at::Tensor & info, const at::Tensor & self, bool upper, bool check_errors) -> ::std::tuple<at::Tensor,at::Tensor> {
pybind11::gil_scoped_release no_gil;
return at::linalg_cholesky_ex_out(L, info, self, upper, check_errors);
};
return wrap(NamedTuple1, dispatch_linalg_cholesky_ex_out(out[0], out[1], _r.tensor(0), _r.toBool(1), _r.toBool(2)));
}
Py_RETURN_NONE;
END_HANDLE_TH_ERRORS
}
```
</details>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/66614
Reviewed By: H-Huang
Differential Revision: D32741134
Pulled By: zou3519
fbshipit-source-id: 27bada30d20e66333ca1be1775608d9f0cbf9f59
2021-12-06 17:03:11 +00:00
|
|
|
"torch/csrc/autograd/generated/python_return_types.cpp",
|
2020-05-20 05:49:28 +00:00
|
|
|
]
|
|
|
|
|
|
|
|
|
|
genrule(
|
|
|
|
|
name = "all_generated_code",
|
|
|
|
|
srcs = [
|
[pytorch] rewrite of the python binding codegen with the v2 API (#46244)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/46244
- What does the generated binding code do?
The Python binding codegen produces code that takes the input list of
PyObjects, finds the matching ATen C++ function using PythonArgParser,
converts the PyObjects into C++ types and calls the ATen C++ function:
```
+--------+ parsing +------------------------+ binding +-----------------------+
| PyObjs | ---------> | PythonArgParser Output | ---------> | Cpp Function Dispatch |
+--------+ +------------------------+ +-----------------------+
```
- Are Python arguments 1-1 mapped to C++ arguments?
Python arguments might be reordered, packed, unpacked when binding to
C++ arguments, as illustrated below:
```
// Binding - Reorder & Packing
// aten::empty.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None,
Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
Python Args Cpp Args
-----------------------------------------------------------
0: size size
1: names names
2: memory_format -------+
3: dtype -----+-|--> options
4: layout / |
5: device / +--> memory_format
6: pin_memory /
7: requires_grad -+
// Binding - Unpacking
// aten::max.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
Python Args Cpp Args
-----------------------------------------------------------
+----> max
/-----> max_values
0: input / self
1: dim / dim
2: keepdim / keepdim
3: out -----+
```
- Why do we want to rewrite the python binding codegen?
The old codegen takes Declarations.yaml as input. It doesn't distinguish
between Python arguments and C++ arguments - they are all mixed together
as a bag of non-typed dict objects. Different methods process these arg
objects and add new attributes for various different purposes. It's not so
obvious to figure out the semantics of these attributes. The complicated
binding logic happens implicitly and scatteredly.
```
+--------------------+
| Native Functions |
+--------------------+
|
|
v
+--------------------+
| Cpp Signatures |
+--------------------+
|
|
v
+--------------------+
| Declarations.yaml |
+--------------------+
| +-------------------------------------+
| +-------> | PythonArgParser Schema |
| | +-------------------------------------+
| | .
| | .
v | .
+--------------------+ +-------------------------------------+
| NonTyped Args Objs | --> | PythonArgParser -> Cpp Args Binding |
+--------------------+ +-------------------------------------+
| .
| .
| .
| +-------------------------------------+
+-------> | Cpp Function Dispatch |
+-------------------------------------+
```
This PR leverages the new immutable data models introduced in the new
aten codegen. It introduces dedicated data models for python schema.
This way, we can not only avoid subtle Declaration.yaml conversions but
also decouple the generation of python schema, python to c++ binding and
c++ function call.
The ultimate state will be like the following diagram:
```
+-------------------+ +-------------------------------------+
+-------> | Python Signatures | --> | PythonArgParser Schema |
| +-------------------+ +-------------------------------------+
| | .
| | .
| | .
+------------------+ | +-------------------------------------+
| Native Functions | +-------> | PythonArgParser -> Cpp Args Binding |
+------------------+ | +-------------------------------------+
| | .
| | .
| | .
| +-------------------+ +-------------------------------------+
+-------> | Cpp Signatures | --> | Cpp Function Dispatch |
+-------------------+ +-------------------------------------+
```
This PR has migrated the core binding logic from
tools/autograd/gen_python_functions.py to tools/codegen/api/python.py.
It produces the byte-for-byte same results (tested with #46243).
Will migrate the rest of gen_python_functions.py in subsequent PRs.
Test Plan: Imported from OSS
Reviewed By: bhosmer
Differential Revision: D24388874
Pulled By: ljk53
fbshipit-source-id: f88b6df4e917cf90d868a2bbae2d5ffb680d1841
2020-10-20 00:34:45 +00:00
|
|
|
"aten/src/ATen/native/native_functions.yaml",
|
2020-04-07 05:48:33 +00:00
|
|
|
],
|
2020-05-20 05:49:28 +00:00
|
|
|
outs = libtorch_cpp_generated_sources + libtorch_python_generated_sources,
|
2021-11-03 20:17:49 +00:00
|
|
|
cmd = "$(location :generate_code) --install_dir `dirname $(location torch/csrc/autograd/generated/variable_factories.h)`/../.. --native-functions-path $(location aten/src/ATen/native/native_functions.yaml) --nn-path aten/src",
|
2020-04-07 05:48:33 +00:00
|
|
|
tools = [":generate_code"],
|
|
|
|
|
)
|
|
|
|
|
|
2020-05-20 05:49:28 +00:00
|
|
|
filegroup(
|
|
|
|
|
name = "cpp_generated_code",
|
|
|
|
|
data = [":all_generated_code"],
|
|
|
|
|
srcs = libtorch_cpp_generated_sources,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
filegroup(
|
|
|
|
|
name = "python_generated_code",
|
|
|
|
|
data = [":all_generated_code"],
|
|
|
|
|
srcs = libtorch_python_generated_sources,
|
|
|
|
|
)
|
|
|
|
|
|
2020-04-07 05:48:33 +00:00
|
|
|
exports_files(
|
|
|
|
|
srcs = ["aten/src/ATen/cpu/tbb/extra/version_string.ver.in"],
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
# ATen
|
|
|
|
|
filegroup(
|
|
|
|
|
name = "aten_base_cpp",
|
|
|
|
|
srcs = glob([
|
|
|
|
|
"aten/src/ATen/*.cpp",
|
|
|
|
|
"aten/src/ATen/detail/*.cpp",
|
|
|
|
|
"aten/src/ATen/cpu/*.cpp",
|
|
|
|
|
]),
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
filegroup(
|
|
|
|
|
name = "ATen_CORE_SRCS",
|
|
|
|
|
srcs = glob(
|
|
|
|
|
[
|
|
|
|
|
"aten/src/ATen/core/**/*.cpp",
|
|
|
|
|
],
|
|
|
|
|
exclude = [
|
|
|
|
|
"aten/src/ATen/core/**/*_test.cpp",
|
|
|
|
|
],
|
|
|
|
|
),
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
filegroup(
|
|
|
|
|
name = "aten_native_cpp",
|
|
|
|
|
srcs = glob(["aten/src/ATen/native/*.cpp"]),
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
filegroup(
|
|
|
|
|
name = "aten_native_sparse_cpp",
|
|
|
|
|
srcs = glob(["aten/src/ATen/native/sparse/*.cpp"]),
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
filegroup(
|
|
|
|
|
name = "aten_native_quantized_cpp",
|
|
|
|
|
srcs = glob(
|
|
|
|
|
[
|
|
|
|
|
"aten/src/ATen/native/quantized/*.cpp",
|
|
|
|
|
"aten/src/ATen/native/quantized/cpu/*.cpp",
|
|
|
|
|
],
|
|
|
|
|
),
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
filegroup(
|
|
|
|
|
name = "aten_native_mkl_cpp",
|
|
|
|
|
srcs = glob(["aten/src/ATen/native/mkl/*.cpp"]),
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
filegroup(
|
|
|
|
|
name = "aten_native_mkldnn_cpp",
|
|
|
|
|
srcs = glob(["aten/src/ATen/native/mkldnn/*.cpp"]),
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
filegroup(
|
|
|
|
|
name = "aten_native_xnnpack",
|
|
|
|
|
srcs = glob(["aten/src/ATen/native/xnnpack/*.cpp"]),
|
|
|
|
|
)
|
|
|
|
|
|
2020-05-26 18:20:51 +00:00
|
|
|
filegroup(
|
2020-08-07 16:04:01 +00:00
|
|
|
name = "aten_base_vulkan",
|
|
|
|
|
srcs = glob(["aten/src/ATen/vulkan/*.cpp"]),
|
2020-05-26 18:20:51 +00:00
|
|
|
)
|
|
|
|
|
|
2020-10-13 08:44:36 +00:00
|
|
|
filegroup(
|
|
|
|
|
name = "aten_base_metal",
|
|
|
|
|
srcs = glob(["aten/src/ATen/metal/*.cpp"]),
|
|
|
|
|
)
|
|
|
|
|
|
2020-04-07 05:48:33 +00:00
|
|
|
filegroup(
|
|
|
|
|
name = "ATen_QUANTIZED_SRCS",
|
|
|
|
|
srcs = glob(
|
|
|
|
|
[
|
|
|
|
|
"aten/src/ATen/quantized/**/*.cpp",
|
|
|
|
|
],
|
|
|
|
|
exclude = [
|
|
|
|
|
"aten/src/ATen/quantized/**/*_test.cpp",
|
|
|
|
|
],
|
|
|
|
|
),
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
filegroup(
|
|
|
|
|
name = "aten_cuda_srcs",
|
|
|
|
|
srcs = [
|
|
|
|
|
"aten/src/ATen/cuda/CUDABlas.cpp",
|
Add cusolver to build, rewrite MAGMA inverse with cusolver (#42403)
Summary:
Fixes https://github.com/pytorch/pytorch/issues/42265
This PR adds cusolver to the pytorch build, and enables the use of cusolver/cublas library functions on GPU `torch.inverse` on certain tensor shapes.
Specifically, when
* the tensor is two dimensional (single batch), or
* has >2 dimensions (multiple batches) and `batch_size <= 2`, or
* magma is not linked,
cusolver/cublas will be used. In other conditions, the current implementation of MAGMA will still be used.
https://github.com/pytorch/pytorch/blob/8c0949ae454b1d2c1b626a5ea19ba5ea6487d305/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu#L742-L752
The reason for this is that for tensors with large batch_size, `cublasXgetrfBatched` and `cublasXgetriBatched` doesn't perform very well. For `batch_size > 1`, we launch cusolver functions in multiple streams. This lets cusolver functions run in parallel, and can greatly increase the performance. When `batch_size > 2`, the parallel launched cusolver functions are slightly slower than the current magma implementation, so we still use the current magma impl.
On CUDA 9.2, there were some numerical issues detected, so cusolver impl will not be used. The cusolver impl will also not be used on platforms other than Nvidia CUDA.
https://github.com/pytorch/pytorch/blob/060769feaf02db56ac79e0c73dab1105828ece69/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.h#L10-L13
Note that there is a new heuristic used before cusolver/cublas calls here:
https://github.com/pytorch/pytorch/blob/8c0949ae454b1d2c1b626a5ea19ba5ea6487d305/aten/src/ATen/native/cuda/MiscUtils.h#L113-L121
where `use_loop_launch = true` means launch single batch cusolver functions in parallel, and `use_loop_launch = false` means use cublas_X_batched functions. When magma is enabled (only `batch_size <= 2` will be dispatched to cusolver/cublas), the heuristic will always return `true` and the cusolver calls are faster than small batch_size magma calls. When magma is disabled, this adds the functionality of `torch.inverse`, which was disabled before for all shapes (though large batch_size cublas performance may not be as well as magma).
Checklist:
- [X] Add benchmark, cpu, gpu-before (magma), gpu-after (cusolver)
- [X] Rewrite single inverse (ndim == 2) with cusolver
- [X] Rewrite batched inverse (ndim > 2) with cublas
- [X] Add cusolver to build
- [x] Clean up functions related to `USE_MAGMA` define guard
- [x] Workaround for non-cuda platform
- [x] Workaround for cuda 9.2
- [x] Add zero size check
- [x] Add tests
Next step:
If cusolver doesn't cause any problem in pytorch build, and there are no major performance regressions reported after this PR being merged, I will start porting other cusolver/cublas functions for linear algebra to improve the performance.
<details>
<summary> benchmark 73499c6 </summary>
benchmark code: https://github.com/xwang233/code-snippet/blob/master/torch.inverse/inverse-cusolver.ipynb
shape meaning:
* `[] 2 torch.float32 -> torch.randn(2, 2, dtype=torch.float32)`
* `[2] 4 torch.float32 -> torch.randn(2, 4, 4, dtype=torch.float32)`
| shape | cpu_time (ms) | gpu_time_before (magma) (ms) | gpu_time_after (ms) |
| --- | --- | --- | --- |
| [] 2 torch.float32 | 0.095 | 7.534 | 0.129 |
| [] 4 torch.float32 | 0.009 | 7.522 | 0.129 |
| [] 8 torch.float32 | 0.011 | 7.647 | 0.138 |
| [] 16 torch.float32 | 0.075 | 7.582 | 0.135 |
| [] 32 torch.float32 | 0.073 | 7.573 | 0.191 |
| [] 64 torch.float32 | 0.134 | 7.694 | 0.288 |
| [] 128 torch.float32 | 0.398 | 8.073 | 0.491 |
| [] 256 torch.float32 | 1.054 | 11.860 | 1.074 |
| [] 512 torch.float32 | 5.218 | 14.130 | 2.582 |
| [] 1024 torch.float32 | 19.010 | 18.780 | 6.936 |
| [1] 2 torch.float32 | 0.009 | 0.113 | 0.128 ***regressed |
| [1] 4 torch.float32 | 0.009 | 0.113 | 0.131 ***regressed |
| [1] 8 torch.float32 | 0.011 | 0.116 | 0.129 ***regressed |
| [1] 16 torch.float32 | 0.015 | 0.122 | 0.135 ***regressed |
| [1] 32 torch.float32 | 0.032 | 0.177 | 0.178 ***regressed |
| [1] 64 torch.float32 | 0.070 | 0.420 | 0.281 |
| [1] 128 torch.float32 | 0.328 | 0.816 | 0.490 |
| [1] 256 torch.float32 | 1.125 | 1.690 | 1.084 |
| [1] 512 torch.float32 | 4.344 | 4.305 | 2.576 |
| [1] 1024 torch.float32 | 16.510 | 16.340 | 6.928 |
| [2] 2 torch.float32 | 0.009 | 0.113 | 0.186 ***regressed |
| [2] 4 torch.float32 | 0.011 | 0.115 | 0.184 ***regressed |
| [2] 8 torch.float32 | 0.012 | 0.114 | 0.184 ***regressed |
| [2] 16 torch.float32 | 0.019 | 0.119 | 0.173 ***regressed |
| [2] 32 torch.float32 | 0.050 | 0.170 | 0.240 ***regressed |
| [2] 64 torch.float32 | 0.120 | 0.429 | 0.375 |
| [2] 128 torch.float32 | 0.576 | 0.830 | 0.675 |
| [2] 256 torch.float32 | 2.021 | 1.748 | 1.451 |
| [2] 512 torch.float32 | 9.070 | 4.749 | 3.539 |
| [2] 1024 torch.float32 | 33.655 | 18.240 | 12.220 |
| [4] 2 torch.float32 | 0.009 | 0.112 | 0.318 ***regressed |
| [4] 4 torch.float32 | 0.010 | 0.115 | 0.319 ***regressed |
| [4] 8 torch.float32 | 0.013 | 0.115 | 0.320 ***regressed |
| [4] 16 torch.float32 | 0.027 | 0.120 | 0.331 ***regressed |
| [4] 32 torch.float32 | 0.085 | 0.173 | 0.385 ***regressed |
| [4] 64 torch.float32 | 0.221 | 0.431 | 0.646 ***regressed |
| [4] 128 torch.float32 | 1.102 | 0.834 | 1.055 ***regressed |
| [4] 256 torch.float32 | 4.042 | 1.811 | 2.054 ***regressed |
| [4] 512 torch.float32 | 18.390 | 4.884 | 5.087 ***regressed |
| [4] 1024 torch.float32 | 69.025 | 19.840 | 20.000 ***regressed |
</details>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/42403
Reviewed By: ailzhang, mruberry
Differential Revision: D23717984
Pulled By: ngimel
fbshipit-source-id: 54cbd9ea72a97989cff4127089938e8a8e29a72b
2020-09-19 03:40:39 +00:00
|
|
|
"aten/src/ATen/cuda/CUDASolver.cpp",
|
2020-04-07 05:48:33 +00:00
|
|
|
"aten/src/ATen/cuda/CUDAContext.cpp",
|
2020-12-14 18:49:57 +00:00
|
|
|
"aten/src/ATen/cuda/CUDAGeneratorImpl.cpp",
|
|
|
|
|
"aten/src/ATen/cuda/CUDAGraph.cpp",
|
2020-04-07 05:48:33 +00:00
|
|
|
"aten/src/ATen/cuda/CuSparseHandlePool.cpp",
|
|
|
|
|
"aten/src/ATen/cuda/CublasHandlePool.cpp",
|
Add cusolver to build, rewrite MAGMA inverse with cusolver (#42403)
Summary:
Fixes https://github.com/pytorch/pytorch/issues/42265
This PR adds cusolver to the pytorch build, and enables the use of cusolver/cublas library functions on GPU `torch.inverse` on certain tensor shapes.
Specifically, when
* the tensor is two dimensional (single batch), or
* has >2 dimensions (multiple batches) and `batch_size <= 2`, or
* magma is not linked,
cusolver/cublas will be used. In other conditions, the current implementation of MAGMA will still be used.
https://github.com/pytorch/pytorch/blob/8c0949ae454b1d2c1b626a5ea19ba5ea6487d305/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu#L742-L752
The reason for this is that for tensors with large batch_size, `cublasXgetrfBatched` and `cublasXgetriBatched` doesn't perform very well. For `batch_size > 1`, we launch cusolver functions in multiple streams. This lets cusolver functions run in parallel, and can greatly increase the performance. When `batch_size > 2`, the parallel launched cusolver functions are slightly slower than the current magma implementation, so we still use the current magma impl.
On CUDA 9.2, there were some numerical issues detected, so cusolver impl will not be used. The cusolver impl will also not be used on platforms other than Nvidia CUDA.
https://github.com/pytorch/pytorch/blob/060769feaf02db56ac79e0c73dab1105828ece69/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.h#L10-L13
Note that there is a new heuristic used before cusolver/cublas calls here:
https://github.com/pytorch/pytorch/blob/8c0949ae454b1d2c1b626a5ea19ba5ea6487d305/aten/src/ATen/native/cuda/MiscUtils.h#L113-L121
where `use_loop_launch = true` means launch single batch cusolver functions in parallel, and `use_loop_launch = false` means use cublas_X_batched functions. When magma is enabled (only `batch_size <= 2` will be dispatched to cusolver/cublas), the heuristic will always return `true` and the cusolver calls are faster than small batch_size magma calls. When magma is disabled, this adds the functionality of `torch.inverse`, which was disabled before for all shapes (though large batch_size cublas performance may not be as well as magma).
Checklist:
- [X] Add benchmark, cpu, gpu-before (magma), gpu-after (cusolver)
- [X] Rewrite single inverse (ndim == 2) with cusolver
- [X] Rewrite batched inverse (ndim > 2) with cublas
- [X] Add cusolver to build
- [x] Clean up functions related to `USE_MAGMA` define guard
- [x] Workaround for non-cuda platform
- [x] Workaround for cuda 9.2
- [x] Add zero size check
- [x] Add tests
Next step:
If cusolver doesn't cause any problem in pytorch build, and there are no major performance regressions reported after this PR being merged, I will start porting other cusolver/cublas functions for linear algebra to improve the performance.
<details>
<summary> benchmark 73499c6 </summary>
benchmark code: https://github.com/xwang233/code-snippet/blob/master/torch.inverse/inverse-cusolver.ipynb
shape meaning:
* `[] 2 torch.float32 -> torch.randn(2, 2, dtype=torch.float32)`
* `[2] 4 torch.float32 -> torch.randn(2, 4, 4, dtype=torch.float32)`
| shape | cpu_time (ms) | gpu_time_before (magma) (ms) | gpu_time_after (ms) |
| --- | --- | --- | --- |
| [] 2 torch.float32 | 0.095 | 7.534 | 0.129 |
| [] 4 torch.float32 | 0.009 | 7.522 | 0.129 |
| [] 8 torch.float32 | 0.011 | 7.647 | 0.138 |
| [] 16 torch.float32 | 0.075 | 7.582 | 0.135 |
| [] 32 torch.float32 | 0.073 | 7.573 | 0.191 |
| [] 64 torch.float32 | 0.134 | 7.694 | 0.288 |
| [] 128 torch.float32 | 0.398 | 8.073 | 0.491 |
| [] 256 torch.float32 | 1.054 | 11.860 | 1.074 |
| [] 512 torch.float32 | 5.218 | 14.130 | 2.582 |
| [] 1024 torch.float32 | 19.010 | 18.780 | 6.936 |
| [1] 2 torch.float32 | 0.009 | 0.113 | 0.128 ***regressed |
| [1] 4 torch.float32 | 0.009 | 0.113 | 0.131 ***regressed |
| [1] 8 torch.float32 | 0.011 | 0.116 | 0.129 ***regressed |
| [1] 16 torch.float32 | 0.015 | 0.122 | 0.135 ***regressed |
| [1] 32 torch.float32 | 0.032 | 0.177 | 0.178 ***regressed |
| [1] 64 torch.float32 | 0.070 | 0.420 | 0.281 |
| [1] 128 torch.float32 | 0.328 | 0.816 | 0.490 |
| [1] 256 torch.float32 | 1.125 | 1.690 | 1.084 |
| [1] 512 torch.float32 | 4.344 | 4.305 | 2.576 |
| [1] 1024 torch.float32 | 16.510 | 16.340 | 6.928 |
| [2] 2 torch.float32 | 0.009 | 0.113 | 0.186 ***regressed |
| [2] 4 torch.float32 | 0.011 | 0.115 | 0.184 ***regressed |
| [2] 8 torch.float32 | 0.012 | 0.114 | 0.184 ***regressed |
| [2] 16 torch.float32 | 0.019 | 0.119 | 0.173 ***regressed |
| [2] 32 torch.float32 | 0.050 | 0.170 | 0.240 ***regressed |
| [2] 64 torch.float32 | 0.120 | 0.429 | 0.375 |
| [2] 128 torch.float32 | 0.576 | 0.830 | 0.675 |
| [2] 256 torch.float32 | 2.021 | 1.748 | 1.451 |
| [2] 512 torch.float32 | 9.070 | 4.749 | 3.539 |
| [2] 1024 torch.float32 | 33.655 | 18.240 | 12.220 |
| [4] 2 torch.float32 | 0.009 | 0.112 | 0.318 ***regressed |
| [4] 4 torch.float32 | 0.010 | 0.115 | 0.319 ***regressed |
| [4] 8 torch.float32 | 0.013 | 0.115 | 0.320 ***regressed |
| [4] 16 torch.float32 | 0.027 | 0.120 | 0.331 ***regressed |
| [4] 32 torch.float32 | 0.085 | 0.173 | 0.385 ***regressed |
| [4] 64 torch.float32 | 0.221 | 0.431 | 0.646 ***regressed |
| [4] 128 torch.float32 | 1.102 | 0.834 | 1.055 ***regressed |
| [4] 256 torch.float32 | 4.042 | 1.811 | 2.054 ***regressed |
| [4] 512 torch.float32 | 18.390 | 4.884 | 5.087 ***regressed |
| [4] 1024 torch.float32 | 69.025 | 19.840 | 20.000 ***regressed |
</details>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/42403
Reviewed By: ailzhang, mruberry
Differential Revision: D23717984
Pulled By: ngimel
fbshipit-source-id: 54cbd9ea72a97989cff4127089938e8a8e29a72b
2020-09-19 03:40:39 +00:00
|
|
|
"aten/src/ATen/cuda/CusolverDnHandlePool.cpp",
|
2020-04-07 05:48:33 +00:00
|
|
|
"aten/src/ATen/cuda/PinnedMemoryAllocator.cpp",
|
|
|
|
|
"aten/src/ATen/cuda/detail/CUDAHooks.cpp",
|
2020-08-18 20:36:02 +00:00
|
|
|
"aten/src/ATen/cudnn/AutocastRNN.cpp",
|
2020-04-07 05:48:33 +00:00
|
|
|
"aten/src/ATen/cudnn/Descriptors.cpp",
|
|
|
|
|
"aten/src/ATen/cudnn/Handle.cpp",
|
|
|
|
|
"aten/src/ATen/cudnn/Types.cpp",
|
|
|
|
|
"aten/src/ATen/native/cuda/CUDAUnaryOps.cpp",
|
|
|
|
|
"aten/src/ATen/native/cuda/TensorShapeCUDA.cpp",
|
|
|
|
|
"aten/src/ATen/native/cudnn/AffineGridGenerator.cpp",
|
|
|
|
|
"aten/src/ATen/native/cudnn/BatchNorm.cpp",
|
|
|
|
|
"aten/src/ATen/native/cudnn/Conv.cpp",
|
|
|
|
|
"aten/src/ATen/native/cudnn/GridSampler.cpp",
|
|
|
|
|
"aten/src/ATen/native/cudnn/LossCTC.cpp",
|
|
|
|
|
"aten/src/ATen/native/cudnn/RNN.cpp",
|
|
|
|
|
"aten/src/ATen/native/miopen/BatchNorm_miopen.cpp",
|
|
|
|
|
"aten/src/ATen/native/miopen/Conv_miopen.cpp",
|
|
|
|
|
"aten/src/ATen/native/miopen/RNN_miopen.cpp",
|
|
|
|
|
"aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp",
|
2021-10-13 03:04:55 +00:00
|
|
|
"aten/src/ATen/native/sparse/cuda/SparseBlas.cpp",
|
2021-09-30 18:30:19 +00:00
|
|
|
"aten/src/ATen/native/sparse/cuda/SparseBlasImpl.cpp",
|
2020-04-07 05:48:33 +00:00
|
|
|
],
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
filegroup(
|
|
|
|
|
name = "aten_srcs_cu",
|
|
|
|
|
srcs = [
|
2021-11-12 21:49:40 +00:00
|
|
|
"aten/src/ATen/cuda/cub.cu.cc",
|
2020-04-07 05:48:33 +00:00
|
|
|
"aten/src/ATen/cuda/detail/IndexUtils.cu.cc",
|
2020-12-04 20:33:13 +00:00
|
|
|
"aten/src/ATen/cuda/detail/CUDAGraphsUtils.cu.cc",
|
2020-04-07 05:48:33 +00:00
|
|
|
"aten/src/ATen/native/cuda/Activation.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/AdaptiveAveragePooling3d.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/AdaptiveMaxPooling2d.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/AdaptiveMaxPooling3d.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/AveragePool2d.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/AveragePool3d.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/BatchLinearAlgebra.cu.cc",
|
Add cusolver to build, rewrite MAGMA inverse with cusolver (#42403)
Summary:
Fixes https://github.com/pytorch/pytorch/issues/42265
This PR adds cusolver to the pytorch build, and enables the use of cusolver/cublas library functions on GPU `torch.inverse` on certain tensor shapes.
Specifically, when
* the tensor is two dimensional (single batch), or
* has >2 dimensions (multiple batches) and `batch_size <= 2`, or
* magma is not linked,
cusolver/cublas will be used. In other conditions, the current implementation of MAGMA will still be used.
https://github.com/pytorch/pytorch/blob/8c0949ae454b1d2c1b626a5ea19ba5ea6487d305/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu#L742-L752
The reason for this is that for tensors with large batch_size, `cublasXgetrfBatched` and `cublasXgetriBatched` doesn't perform very well. For `batch_size > 1`, we launch cusolver functions in multiple streams. This lets cusolver functions run in parallel, and can greatly increase the performance. When `batch_size > 2`, the parallel launched cusolver functions are slightly slower than the current magma implementation, so we still use the current magma impl.
On CUDA 9.2, there were some numerical issues detected, so cusolver impl will not be used. The cusolver impl will also not be used on platforms other than Nvidia CUDA.
https://github.com/pytorch/pytorch/blob/060769feaf02db56ac79e0c73dab1105828ece69/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.h#L10-L13
Note that there is a new heuristic used before cusolver/cublas calls here:
https://github.com/pytorch/pytorch/blob/8c0949ae454b1d2c1b626a5ea19ba5ea6487d305/aten/src/ATen/native/cuda/MiscUtils.h#L113-L121
where `use_loop_launch = true` means launch single batch cusolver functions in parallel, and `use_loop_launch = false` means use cublas_X_batched functions. When magma is enabled (only `batch_size <= 2` will be dispatched to cusolver/cublas), the heuristic will always return `true` and the cusolver calls are faster than small batch_size magma calls. When magma is disabled, this adds the functionality of `torch.inverse`, which was disabled before for all shapes (though large batch_size cublas performance may not be as well as magma).
Checklist:
- [X] Add benchmark, cpu, gpu-before (magma), gpu-after (cusolver)
- [X] Rewrite single inverse (ndim == 2) with cusolver
- [X] Rewrite batched inverse (ndim > 2) with cublas
- [X] Add cusolver to build
- [x] Clean up functions related to `USE_MAGMA` define guard
- [x] Workaround for non-cuda platform
- [x] Workaround for cuda 9.2
- [x] Add zero size check
- [x] Add tests
Next step:
If cusolver doesn't cause any problem in pytorch build, and there are no major performance regressions reported after this PR being merged, I will start porting other cusolver/cublas functions for linear algebra to improve the performance.
<details>
<summary> benchmark 73499c6 </summary>
benchmark code: https://github.com/xwang233/code-snippet/blob/master/torch.inverse/inverse-cusolver.ipynb
shape meaning:
* `[] 2 torch.float32 -> torch.randn(2, 2, dtype=torch.float32)`
* `[2] 4 torch.float32 -> torch.randn(2, 4, 4, dtype=torch.float32)`
| shape | cpu_time (ms) | gpu_time_before (magma) (ms) | gpu_time_after (ms) |
| --- | --- | --- | --- |
| [] 2 torch.float32 | 0.095 | 7.534 | 0.129 |
| [] 4 torch.float32 | 0.009 | 7.522 | 0.129 |
| [] 8 torch.float32 | 0.011 | 7.647 | 0.138 |
| [] 16 torch.float32 | 0.075 | 7.582 | 0.135 |
| [] 32 torch.float32 | 0.073 | 7.573 | 0.191 |
| [] 64 torch.float32 | 0.134 | 7.694 | 0.288 |
| [] 128 torch.float32 | 0.398 | 8.073 | 0.491 |
| [] 256 torch.float32 | 1.054 | 11.860 | 1.074 |
| [] 512 torch.float32 | 5.218 | 14.130 | 2.582 |
| [] 1024 torch.float32 | 19.010 | 18.780 | 6.936 |
| [1] 2 torch.float32 | 0.009 | 0.113 | 0.128 ***regressed |
| [1] 4 torch.float32 | 0.009 | 0.113 | 0.131 ***regressed |
| [1] 8 torch.float32 | 0.011 | 0.116 | 0.129 ***regressed |
| [1] 16 torch.float32 | 0.015 | 0.122 | 0.135 ***regressed |
| [1] 32 torch.float32 | 0.032 | 0.177 | 0.178 ***regressed |
| [1] 64 torch.float32 | 0.070 | 0.420 | 0.281 |
| [1] 128 torch.float32 | 0.328 | 0.816 | 0.490 |
| [1] 256 torch.float32 | 1.125 | 1.690 | 1.084 |
| [1] 512 torch.float32 | 4.344 | 4.305 | 2.576 |
| [1] 1024 torch.float32 | 16.510 | 16.340 | 6.928 |
| [2] 2 torch.float32 | 0.009 | 0.113 | 0.186 ***regressed |
| [2] 4 torch.float32 | 0.011 | 0.115 | 0.184 ***regressed |
| [2] 8 torch.float32 | 0.012 | 0.114 | 0.184 ***regressed |
| [2] 16 torch.float32 | 0.019 | 0.119 | 0.173 ***regressed |
| [2] 32 torch.float32 | 0.050 | 0.170 | 0.240 ***regressed |
| [2] 64 torch.float32 | 0.120 | 0.429 | 0.375 |
| [2] 128 torch.float32 | 0.576 | 0.830 | 0.675 |
| [2] 256 torch.float32 | 2.021 | 1.748 | 1.451 |
| [2] 512 torch.float32 | 9.070 | 4.749 | 3.539 |
| [2] 1024 torch.float32 | 33.655 | 18.240 | 12.220 |
| [4] 2 torch.float32 | 0.009 | 0.112 | 0.318 ***regressed |
| [4] 4 torch.float32 | 0.010 | 0.115 | 0.319 ***regressed |
| [4] 8 torch.float32 | 0.013 | 0.115 | 0.320 ***regressed |
| [4] 16 torch.float32 | 0.027 | 0.120 | 0.331 ***regressed |
| [4] 32 torch.float32 | 0.085 | 0.173 | 0.385 ***regressed |
| [4] 64 torch.float32 | 0.221 | 0.431 | 0.646 ***regressed |
| [4] 128 torch.float32 | 1.102 | 0.834 | 1.055 ***regressed |
| [4] 256 torch.float32 | 4.042 | 1.811 | 2.054 ***regressed |
| [4] 512 torch.float32 | 18.390 | 4.884 | 5.087 ***regressed |
| [4] 1024 torch.float32 | 69.025 | 19.840 | 20.000 ***regressed |
</details>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/42403
Reviewed By: ailzhang, mruberry
Differential Revision: D23717984
Pulled By: ngimel
fbshipit-source-id: 54cbd9ea72a97989cff4127089938e8a8e29a72b
2020-09-19 03:40:39 +00:00
|
|
|
"aten/src/ATen/native/cuda/BatchLinearAlgebraLib.cu.cc",
|
2020-04-07 05:48:33 +00:00
|
|
|
"aten/src/ATen/native/cuda/BinaryArithmeticKernel.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/BinaryCompareKernel.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/CUDAScalar.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/Col2Im.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/Copy.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/CrossKernel.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/DilatedMaxPool2d.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/DilatedMaxPool3d.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/DistanceKernel.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/Distributions.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/Dropout.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/Embedding.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/EmbeddingBag.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/FillKernel.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/FractionalMaxPool2d.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/FractionalMaxPool3d.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/GridSampler.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/Im2Col.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/IndexKernel.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/Indexing.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/Lerp.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/LinearAlgebra.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/Loss.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/LossCTC.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/MaxUnpooling.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/MultinomialKernel.cu.cc",
|
MAINT Migrates multilabel_margin_loss from THC to ATen (CUDA) (#60708)
Summary:
Fixes https://github.com/pytorch/pytorch/issues/24603
Fixes https://github.com/pytorch/pytorch/issues/24602
<s>The implementation should be exactly the same, so it is strange that the benchmarks show such a significant improvement in this PR.</s>
The benchmarks are now the same.
<details>
<summary>Benchmark script</summary>
```python
from itertools import product
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
torch.manual_seed(0)
MS_PER_SECOND = 1000
def _time():
torch.cuda.synchronize()
return time.perf_counter() * MS_PER_SECOND
device = "cuda"
C = 30
n_runs = 100
reductions = ["none", "sum", "mean"]
Ns = [1_000, 10_000, 100_000]
for reduction, N in product(reductions, Ns):
total_fwd_time = 0
total_back_time = 0
grad_out = torch.randn(N, device=device)
if reduction != "none":
grad_out = grad_out[0]
for _ in range(n_runs):
input = torch.randn(N, C, device=device, requires_grad=True)
target = torch.randint(0, C, size=input.size(), device=device)
# forward
start = _time()
result = F.multilabel_margin_loss(input, target, reduction=reduction)
total_fwd_time += _time() - start
result = F.multilabel_margin_loss(input, target, reduction=reduction)
for _ in range(n_runs):
# backward
start = _time()
result.backward(grad_out, retain_graph=True)
total_back_time += _time() - start
fwd_avg = total_fwd_time / n_runs
bwd_avg = total_back_time / n_runs
print(
f"input size({N}, {C}), reduction: {reduction}, fwd: {fwd_avg:.2f} (ms), back: {bwd_avg:.2f} (ms)"
)
```
</details>
## master
```
input size(1000, 30), reduction: none, fwd: 0.14 (ms), back: 0.41 (ms)
input size(10000, 30), reduction: none, fwd: 1.26 (ms), back: 3.58 (ms)
input size(100000, 30), reduction: none, fwd: 13.15 (ms), back: 34.68 (ms)
input size(1000, 30), reduction: sum, fwd: 0.14 (ms), back: 0.38 (ms)
input size(10000, 30), reduction: sum, fwd: 1.16 (ms), back: 3.53 (ms)
input size(100000, 30), reduction: sum, fwd: 13.04 (ms), back: 34.53 (ms)
input size(1000, 30), reduction: mean, fwd: 0.14 (ms), back: 0.38 (ms)
input size(10000, 30), reduction: mean, fwd: 1.17 (ms), back: 3.52 (ms)
input size(100000, 30), reduction: mean, fwd: 13.12 (ms), back: 34.54 (ms)
```
## this PR
```
input size(1000, 30), reduction: none, fwd: 0.14 (ms), back: 0.35 (ms)
input size(10000, 30), reduction: none, fwd: 1.22 (ms), back: 2.98 (ms)
input size(100000, 30), reduction: none, fwd: 12.90 (ms), back: 29.32 (ms)
input size(1000, 30), reduction: sum, fwd: 0.14 (ms), back: 0.32 (ms)
input size(10000, 30), reduction: sum, fwd: 1.16 (ms), back: 2.97 (ms)
input size(100000, 30), reduction: sum, fwd: 13.00 (ms), back: 29.17 (ms)
input size(1000, 30), reduction: mean, fwd: 0.14 (ms), back: 0.32 (ms)
input size(10000, 30), reduction: mean, fwd: 1.17 (ms), back: 2.97 (ms)
input size(100000, 30), reduction: mean, fwd: 13.09 (ms), back: 28.91 (ms)
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/60708
Reviewed By: saketh-are
Differential Revision: D29856579
Pulled By: ngimel
fbshipit-source-id: b6bbf27a71e5a04f61779f6fef4ed1c98baa2607
2021-07-23 20:44:20 +00:00
|
|
|
"aten/src/ATen/native/cuda/MultiLabelMarginCriterion.cu.cc",
|
2020-04-07 05:48:33 +00:00
|
|
|
"aten/src/ATen/native/cuda/NaiveConvolutionTranspose2d.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/NaiveConvolutionTranspose3d.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/NaiveDilatedConvolution.cu.cc",
|
ENH Migrate nll_loss2d from THC to ATen (#62826)
Summary:
Fixes https://github.com/pytorch/pytorch/issues/24608
Fixes https://github.com/pytorch/pytorch/issues/24607
With the following benchmark, the backward pass runs a little slower. This is strange since the implementation should be exactly the same.
<details>
<summary>Benchmark script</summary>
```python
from itertools import product
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
torch.manual_seed(0)
MS_PER_SECOND = 1000
def _time():
torch.cuda.synchronize()
return time.perf_counter() * MS_PER_SECOND
device = "cuda"
C = 3
n_runs = 30
reductions = ["none", "sum", "mean"]
Ns = [128, 256, 512]
Hs = [128, 256, 512]
for reduction, N, H in product(reductions, Ns, Hs):
total_fwd_time = 0
total_back_time = 0
if reduction == "none":
grad_out = torch.randn(N, H, H, device=device)
else:
grad_out = torch.randn(1)[0]
for _ in range(n_runs):
input = torch.randn(N, C, H, H, device=device, requires_grad=True)
target = torch.rand(N, H, H, device=device).mul(3).floor().long()
# forward
start = _time()
result = F.nll_loss(input, target, reduction=reduction)
total_fwd_time += _time() - start
result = F.nll_loss(input, target, reduction=reduction)
for _ in range(n_runs):
# backward
start = _time()
result.backward(grad_out, retain_graph=True)
total_back_time += _time() - start
fwd_avg = total_fwd_time / n_runs
bwd_avg = total_back_time / n_runs
print(
f"input size({N}, {C}, {H}, {H}), reduction: {reduction}, fwd: {fwd_avg:.2f} (ms), back: {bwd_avg:.2f} (ms)"
)
```
</details>
<details>
<summary>master results</summary>
```
input size(128, 3, 128, 128), reduction: none, fwd: 0.34 (ms), back: 0.57 (ms)
input size(128, 3, 256, 256), reduction: none, fwd: 2.56 (ms), back: 3.85 (ms)
input size(128, 3, 512, 512), reduction: none, fwd: 14.54 (ms), back: 16.62 (ms)
input size(256, 3, 128, 128), reduction: none, fwd: 1.26 (ms), back: 1.78 (ms)
input size(256, 3, 256, 256), reduction: none, fwd: 7.07 (ms), back: 8.22 (ms)
input size(256, 3, 512, 512), reduction: none, fwd: 29.38 (ms), back: 33.29 (ms)
input size(512, 3, 128, 128), reduction: none, fwd: 3.41 (ms), back: 4.05 (ms)
input size(512, 3, 256, 256), reduction: none, fwd: 14.32 (ms), back: 16.46 (ms)
input size(512, 3, 512, 512), reduction: none, fwd: 59.20 (ms), back: 66.68 (ms)
input size(128, 3, 128, 128), reduction: sum, fwd: 0.08 (ms), back: 0.21 (ms)
input size(128, 3, 256, 256), reduction: sum, fwd: 0.21 (ms), back: 0.73 (ms)
input size(128, 3, 512, 512), reduction: sum, fwd: 0.82 (ms), back: 2.86 (ms)
input size(256, 3, 128, 128), reduction: sum, fwd: 0.12 (ms), back: 0.39 (ms)
input size(256, 3, 256, 256), reduction: sum, fwd: 0.42 (ms), back: 1.45 (ms)
input size(256, 3, 512, 512), reduction: sum, fwd: 1.53 (ms), back: 5.66 (ms)
input size(512, 3, 128, 128), reduction: sum, fwd: 0.21 (ms), back: 0.74 (ms)
input size(512, 3, 256, 256), reduction: sum, fwd: 0.78 (ms), back: 2.86 (ms)
input size(512, 3, 512, 512), reduction: sum, fwd: 2.98 (ms), back: 11.23 (ms)
input size(128, 3, 128, 128), reduction: mean, fwd: 0.07 (ms), back: 0.21 (ms)
input size(128, 3, 256, 256), reduction: mean, fwd: 0.21 (ms), back: 0.73 (ms)
input size(128, 3, 512, 512), reduction: mean, fwd: 0.82 (ms), back: 2.86 (ms)
input size(256, 3, 128, 128), reduction: mean, fwd: 0.13 (ms), back: 0.39 (ms)
input size(256, 3, 256, 256), reduction: mean, fwd: 0.42 (ms), back: 1.45 (ms)
input size(256, 3, 512, 512), reduction: mean, fwd: 1.54 (ms), back: 5.65 (ms)
input size(512, 3, 128, 128), reduction: mean, fwd: 0.22 (ms), back: 0.74 (ms)
input size(512, 3, 256, 256), reduction: mean, fwd: 0.78 (ms), back: 2.87 (ms)
input size(512, 3, 512, 512), reduction: mean, fwd: 2.98 (ms), back: 11.23 (ms)
```
</details>
<details>
<summary>PR results</summary>
```
input size(128, 3, 128, 128), reduction: none, fwd: 0.33 (ms), back: 0.59 (ms)
input size(128, 3, 256, 256), reduction: none, fwd: 2.51 (ms), back: 3.92 (ms)
input size(128, 3, 512, 512), reduction: none, fwd: 14.52 (ms), back: 17.05 (ms)
input size(256, 3, 128, 128), reduction: none, fwd: 1.23 (ms), back: 1.85 (ms)
input size(256, 3, 256, 256), reduction: none, fwd: 7.07 (ms), back: 8.45 (ms)
input size(256, 3, 512, 512), reduction: none, fwd: 29.39 (ms), back: 34.21 (ms)
input size(512, 3, 128, 128), reduction: none, fwd: 3.40 (ms), back: 4.18 (ms)
input size(512, 3, 256, 256), reduction: none, fwd: 14.33 (ms), back: 16.90 (ms)
input size(512, 3, 512, 512), reduction: none, fwd: 59.04 (ms), back: 68.36 (ms)
input size(128, 3, 128, 128), reduction: sum, fwd: 0.07 (ms), back: 0.25 (ms)
input size(128, 3, 256, 256), reduction: sum, fwd: 0.21 (ms), back: 0.86 (ms)
input size(128, 3, 512, 512), reduction: sum, fwd: 0.82 (ms), back: 3.33 (ms)
input size(256, 3, 128, 128), reduction: sum, fwd: 0.12 (ms), back: 0.46 (ms)
input size(256, 3, 256, 256), reduction: sum, fwd: 0.42 (ms), back: 1.70 (ms)
input size(256, 3, 512, 512), reduction: sum, fwd: 1.53 (ms), back: 6.58 (ms)
input size(512, 3, 128, 128), reduction: sum, fwd: 0.21 (ms), back: 0.87 (ms)
input size(512, 3, 256, 256), reduction: sum, fwd: 0.78 (ms), back: 3.34 (ms)
input size(512, 3, 512, 512), reduction: sum, fwd: 2.98 (ms), back: 13.07 (ms)
input size(128, 3, 128, 128), reduction: mean, fwd: 0.07 (ms), back: 0.26 (ms)
input size(128, 3, 256, 256), reduction: mean, fwd: 0.21 (ms), back: 0.86 (ms)
input size(128, 3, 512, 512), reduction: mean, fwd: 0.82 (ms), back: 3.34 (ms)
input size(256, 3, 128, 128), reduction: mean, fwd: 0.12 (ms), back: 0.46 (ms)
input size(256, 3, 256, 256), reduction: mean, fwd: 0.42 (ms), back: 1.72 (ms)
input size(256, 3, 512, 512), reduction: mean, fwd: 1.53 (ms), back: 6.60 (ms)
input size(512, 3, 128, 128), reduction: mean, fwd: 0.21 (ms), back: 0.87 (ms)
input size(512, 3, 256, 256), reduction: mean, fwd: 0.78 (ms), back: 3.33 (ms)
input size(512, 3, 512, 512), reduction: mean, fwd: 2.98 (ms), back: 13.07 (ms)
```
</details>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/62826
Reviewed By: bdhirsh
Differential Revision: D30282279
Pulled By: ngimel
fbshipit-source-id: 4aa0ff3f8af0632957417931d332ec486a12b52d
2021-08-13 01:05:29 +00:00
|
|
|
"aten/src/ATen/native/cuda/NLLLoss2d.cu.cc",
|
2020-04-07 05:48:33 +00:00
|
|
|
"aten/src/ATen/native/cuda/Normalization.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/PointwiseOpsKernel.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/PowKernel.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/RNN.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/RangeFactories.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/Reduce.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/ReduceOpsKernel.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/ReflectionPad.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/Repeat.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/ReplicationPadding.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/Resize.cu.cc",
|
2021-04-27 18:27:36 +00:00
|
|
|
"aten/src/ATen/native/cuda/SegmentReduce.cu.cc",
|
2020-04-07 05:48:33 +00:00
|
|
|
"aten/src/ATen/native/cuda/SoftMax.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/SortingKthValue.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/SparseMM.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/SpectralOps.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/SummaryOps.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/TensorCompare.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/TensorFactories.cu.cc",
|
2021-04-27 21:47:05 +00:00
|
|
|
"aten/src/ATen/native/cuda/TensorTopK.cu.cc",
|
2020-04-07 05:48:33 +00:00
|
|
|
"aten/src/ATen/native/cuda/TensorTransformations.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/TriangularOps.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/UnaryOpsKernel.cu.cc",
|
[special] Add `i0e` (#54409)
Summary:
Reference: https://github.com/pytorch/pytorch/issues/50345
Changes:
* Add `i0e`
* Move some kernels from `UnaryOpsKernel.cu` to `UnarySpecialOpsKernel.cu` to decrease compilation time per file.
Time taken by i0e_vs_scipy tests: around 6.33.s
<details>
<summary>Test Run Log</summary>
```
(pytorch-cuda-dev) kshiteej@qgpu1:~/Pytorch/pytorch_module_special$ pytest test/test_unary_ufuncs.py -k _i0e_vs
======================================================================= test session starts ========================================================================
platform linux -- Python 3.8.6, pytest-6.1.2, py-1.9.0, pluggy-0.13.1
rootdir: /home/kshiteej/Pytorch/pytorch_module_special, configfile: pytest.ini
plugins: hypothesis-5.38.1
collected 8843 items / 8833 deselected / 10 selected
test/test_unary_ufuncs.py ...sss.... [100%]
========================================================================= warnings summary =========================================================================
../../.conda/envs/pytorch-cuda-dev/lib/python3.8/site-packages/torch/backends/cudnn/__init__.py:73
test/test_unary_ufuncs.py::TestUnaryUfuncsCUDA::test_special_i0e_vs_scipy_cuda_bfloat16
/home/kshiteej/.conda/envs/pytorch-cuda-dev/lib/python3.8/site-packages/torch/backends/cudnn/__init__.py:73: UserWarning: PyTorch was compiled without cuDNN/MIOpen support. To use cuDNN/MIOpen, rebuild PyTorch making sure the library is visible to the build system.
warnings.warn(
-- Docs: https://docs.pytest.org/en/stable/warnings.html
===================================================================== short test summary info ======================================================================
SKIPPED [3] test/test_unary_ufuncs.py:1182: not implemented: Could not run 'aten::_copy_from' with arguments from the 'Meta' backend. This could be because the operator doesn't exist for this backend, or was omitted during the selective/custom build process (if using custom build). If you are a Facebook employee using PyTorch on mobile, please visit https://fburl.com/ptmfixes for possible resolutions. 'aten::_copy_from' is only available for these backends: [BackendSelect, Named, InplaceOrView, AutogradOther, AutogradCPU, AutogradCUDA, AutogradXLA, UNKNOWN_TENSOR_TYPE_ID, AutogradMLC, AutogradNestedTensor, AutogradPrivateUse1, AutogradPrivateUse2, AutogradPrivateUse3, Tracer, Autocast, Batched, VmapMode].
BackendSelect: fallthrough registered at ../aten/src/ATen/core/BackendSelectFallbackKernel.cpp:3 [backend fallback]
Named: registered at ../aten/src/ATen/core/NamedRegistrations.cpp:7 [backend fallback]
InplaceOrView: fallthrough registered at ../aten/src/ATen/core/VariableFallbackKernel.cpp:56 [backend fallback]
AutogradOther: registered at ../torch/csrc/autograd/generated/VariableType_4.cpp:8761 [autograd kernel]
AutogradCPU: registered at ../torch/csrc/autograd/generated/VariableType_4.cpp:8761 [autograd kernel]
AutogradCUDA: registered at ../torch/csrc/autograd/generated/VariableType_4.cpp:8761 [autograd kernel]
AutogradXLA: registered at ../torch/csrc/autograd/generated/VariableType_4.cpp:8761 [autograd kernel]
UNKNOWN_TENSOR_TYPE_ID: registered at ../torch/csrc/autograd/generated/VariableType_4.cpp:8761 [autograd kernel]
AutogradMLC: registered at ../torch/csrc/autograd/generated/VariableType_4.cpp:8761 [autograd kernel]
AutogradNestedTensor: registered at ../torch/csrc/autograd/generated/VariableType_4.cpp:8761 [autograd kernel]
AutogradPrivateUse1: registered at ../torch/csrc/autograd/generated/VariableType_4.cpp:8761 [autograd kernel]
AutogradPrivateUse2: registered at ../torch/csrc/autograd/generated/VariableType_4.cpp:8761 [autograd kernel]
AutogradPrivateUse3: registered at ../torch/csrc/autograd/generated/VariableType_4.cpp:8761 [autograd kernel]
Tracer: registered at ../torch/csrc/autograd/generated/TraceType_4.cpp:9348 [kernel]
Autocast: fallthrough registered at ../aten/src/ATen/autocast_mode.cpp:250 [backend fallback]
Batched: registered at ../aten/src/ATen/BatchingRegistrations.cpp:1016 [backend fallback]
VmapMode: fallthrough registered at ../aten/src/ATen/VmapModeRegistrations.cpp:33 [backend fallback]
==================================================== 7 passed, 3 skipped, 8833 deselected, 2 warnings in 6.33s =====================================================
```
</details>
TODO:
* [x] Check rendered docs (https://11743402-65600975-gh.circle-artifacts.com/0/docs/special.html)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/54409
Reviewed By: jbschlosser
Differential Revision: D27760472
Pulled By: mruberry
fbshipit-source-id: bdfbcaa798b00c51dc9513c34626246c8fc10548
2021-04-15 13:04:44 +00:00
|
|
|
"aten/src/ATen/native/cuda/UnarySpecialOpsKernel.cu.cc",
|
2020-04-07 05:48:33 +00:00
|
|
|
"aten/src/ATen/native/cuda/Unique.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/UpSampleBicubic2d.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/UpSampleBilinear2d.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/UpSampleLinear1d.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/UpSampleNearest1d.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/UpSampleNearest2d.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/UpSampleNearest3d.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/WeightNorm.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/cuda/layer_norm_kernel.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/quantized/cuda/fake_quantize_core.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu.cc",
|
|
|
|
|
"aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu.cc",
|
|
|
|
|
],
|
|
|
|
|
)
|
|
|
|
|
|
2020-08-27 19:09:34 +00:00
|
|
|
header_template_rule(
|
2020-04-07 05:48:33 +00:00
|
|
|
name = "aten_src_ATen_config",
|
|
|
|
|
src = "aten/src/ATen/Config.h.in",
|
|
|
|
|
out = "aten/src/ATen/Config.h",
|
|
|
|
|
substitutions = {
|
|
|
|
|
"@AT_MKLDNN_ENABLED@": "1",
|
|
|
|
|
"@AT_MKL_ENABLED@": "0",
|
2020-12-10 15:33:54 +00:00
|
|
|
"@AT_FFTW_ENABLED@": "0",
|
2021-06-30 23:27:07 +00:00
|
|
|
"@AT_POCKETFFT_ENABLED@": "0",
|
2020-04-07 05:48:33 +00:00
|
|
|
"@AT_NNPACK_ENABLED@": "0",
|
|
|
|
|
"@CAFFE2_STATIC_LINK_CUDA_INT@": "0",
|
2021-06-22 17:53:55 +00:00
|
|
|
"@AT_BUILD_WITH_BLAS@": "1",
|
|
|
|
|
"@AT_BUILD_WITH_LAPACK@": "1",
|
2020-06-19 03:17:48 +00:00
|
|
|
"@AT_PARALLEL_OPENMP@": "0",
|
|
|
|
|
"@AT_PARALLEL_NATIVE@": "1",
|
|
|
|
|
"@AT_PARALLEL_NATIVE_TBB@": "0",
|
2021-06-22 17:53:55 +00:00
|
|
|
"@AT_BLAS_F2C@": "0",
|
|
|
|
|
"@AT_BLAS_USE_CBLAS_DOT@": "1",
|
2020-04-07 05:48:33 +00:00
|
|
|
},
|
|
|
|
|
)
|
|
|
|
|
|
2020-08-27 19:09:34 +00:00
|
|
|
header_template_rule(
|
2020-04-07 05:48:33 +00:00
|
|
|
name = "aten_src_ATen_cuda_config",
|
|
|
|
|
src = "aten/src/ATen/cuda/CUDAConfig.h.in",
|
|
|
|
|
out = "aten/src/ATen/cuda/CUDAConfig.h",
|
|
|
|
|
substitutions = {
|
|
|
|
|
"@AT_CUDNN_ENABLED@": "1",
|
|
|
|
|
"@AT_ROCM_ENABLED@": "0",
|
2021-10-14 00:48:28 +00:00
|
|
|
"@AT_MAGMA_ENABLED@": "0",
|
2020-04-07 05:48:33 +00:00
|
|
|
"@NVCC_FLAGS_EXTRA@": "",
|
|
|
|
|
},
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
cc_library(
|
|
|
|
|
name = "aten_headers",
|
|
|
|
|
hdrs = [
|
2021-12-07 23:22:44 +00:00
|
|
|
"torch/csrc/Export.h",
|
2020-04-07 05:48:33 +00:00
|
|
|
"torch/csrc/jit/frontend/function_schema_parser.h",
|
|
|
|
|
] + glob([
|
2020-05-19 05:00:55 +00:00
|
|
|
"aten/src/**/*.h",
|
|
|
|
|
"aten/src/**/*.hpp",
|
|
|
|
|
"aten/src/TH/**/*.cpp",
|
2020-04-07 05:48:33 +00:00
|
|
|
"aten/src/THC/*.cuh",
|
2020-10-15 03:03:37 +00:00
|
|
|
],
|
|
|
|
|
exclude = [
|
|
|
|
|
"aten/src/ATen/Config.h",
|
|
|
|
|
],) + [
|
2020-04-07 05:48:33 +00:00
|
|
|
":generated_cpp",
|
|
|
|
|
":aten_src_ATen_config",
|
|
|
|
|
],
|
|
|
|
|
includes = [
|
|
|
|
|
"aten/src",
|
|
|
|
|
"aten/src/TH",
|
|
|
|
|
],
|
|
|
|
|
deps = [
|
|
|
|
|
":c10_headers",
|
|
|
|
|
],
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
ATEN_COPTS = COMMON_COPTS + [
|
|
|
|
|
"-DUSE_AVX",
|
|
|
|
|
"-DUSE_AVX2",
|
|
|
|
|
"-DCAFFE2_BUILD_MAIN_LIBS",
|
|
|
|
|
"-DHAVE_AVX_CPU_DEFINITION",
|
|
|
|
|
"-DHAVE_AVX2_CPU_DEFINITION",
|
|
|
|
|
"-fvisibility-inlines-hidden",
|
|
|
|
|
"-fno-math-errno",
|
|
|
|
|
"-fno-trapping-math",
|
|
|
|
|
]
|
|
|
|
|
|
|
|
|
|
intern_build_aten_ops(
|
|
|
|
|
copts = ATEN_COPTS,
|
|
|
|
|
deps = [
|
|
|
|
|
":aten_headers",
|
2020-04-28 05:47:09 +00:00
|
|
|
"@sleef",
|
2020-04-07 05:48:33 +00:00
|
|
|
"@fbgemm",
|
|
|
|
|
],
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
cc_library(
|
|
|
|
|
name = "aten",
|
|
|
|
|
srcs = [
|
|
|
|
|
":ATen_CORE_SRCS",
|
|
|
|
|
":ATen_QUANTIZED_SRCS",
|
|
|
|
|
":aten_base_cpp",
|
2020-10-13 08:44:36 +00:00
|
|
|
":aten_base_metal",
|
2020-08-07 16:04:01 +00:00
|
|
|
":aten_base_vulkan",
|
2020-04-07 05:48:33 +00:00
|
|
|
":aten_native_cpp",
|
|
|
|
|
":aten_native_mkl_cpp",
|
|
|
|
|
":aten_native_mkldnn_cpp",
|
|
|
|
|
":aten_native_quantized_cpp",
|
|
|
|
|
":aten_native_sparse_cpp",
|
|
|
|
|
":aten_native_xnnpack",
|
|
|
|
|
":aten_src_ATen_config",
|
|
|
|
|
":generated_cpp",
|
|
|
|
|
],
|
|
|
|
|
copts = ATEN_COPTS,
|
|
|
|
|
data = if_cuda(
|
|
|
|
|
[":libcaffe2_nvrtc.so"],
|
|
|
|
|
[],
|
|
|
|
|
),
|
|
|
|
|
visibility = ["//visibility:public"],
|
|
|
|
|
deps = [
|
|
|
|
|
":ATen_CPU",
|
|
|
|
|
":aten_headers",
|
|
|
|
|
":caffe2_for_aten_headers",
|
|
|
|
|
":torch_headers",
|
|
|
|
|
"@fbgemm",
|
|
|
|
|
"@ideep",
|
|
|
|
|
],
|
|
|
|
|
alwayslink = True,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
cc_library(
|
|
|
|
|
name = "aten_nvrtc",
|
|
|
|
|
srcs = glob([
|
|
|
|
|
"aten/src/ATen/cuda/nvrtc_stub/*.cpp",
|
|
|
|
|
]),
|
|
|
|
|
copts = ATEN_COPTS,
|
|
|
|
|
linkstatic = True,
|
|
|
|
|
visibility = ["//visibility:public"],
|
|
|
|
|
deps = [
|
|
|
|
|
":aten_headers",
|
|
|
|
|
":c10_headers",
|
|
|
|
|
"@cuda",
|
|
|
|
|
"@cuda//:cuda_driver",
|
|
|
|
|
"@cuda//:nvrtc",
|
|
|
|
|
],
|
|
|
|
|
alwayslink = True,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
cc_binary(
|
|
|
|
|
name = "libcaffe2_nvrtc.so",
|
|
|
|
|
linkshared = True,
|
|
|
|
|
visibility = ["//visibility:public"],
|
|
|
|
|
deps = [
|
|
|
|
|
":aten_nvrtc",
|
|
|
|
|
],
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
cc_library(
|
|
|
|
|
name = "aten_cuda_cpp",
|
|
|
|
|
srcs = [":aten_cuda_srcs"],
|
|
|
|
|
copts = ATEN_COPTS,
|
|
|
|
|
visibility = ["//visibility:public"],
|
|
|
|
|
deps = [
|
|
|
|
|
":aten",
|
|
|
|
|
"@cuda",
|
|
|
|
|
"@cuda//:nvrtc",
|
|
|
|
|
"@cudnn",
|
|
|
|
|
],
|
|
|
|
|
alwayslink = True,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
torch_cuda_half_options = [
|
|
|
|
|
"-DCUDA_HAS_FP16=1",
|
|
|
|
|
"-D__CUDA_NO_HALF_OPERATORS__",
|
|
|
|
|
"-D__CUDA_NO_HALF_CONVERSIONS__",
|
2020-10-02 23:19:14 +00:00
|
|
|
"-D__CUDA_NO_BFLOAT16_CONVERSIONS__",
|
2020-04-07 05:48:33 +00:00
|
|
|
"-D__CUDA_NO_HALF2_OPERATORS__",
|
|
|
|
|
]
|
|
|
|
|
|
|
|
|
|
cu_library(
|
|
|
|
|
name = "aten_cuda",
|
|
|
|
|
srcs = [
|
|
|
|
|
":aten_srcs_cu",
|
|
|
|
|
],
|
|
|
|
|
copts = ATEN_COPTS + torch_cuda_half_options,
|
|
|
|
|
visibility = ["//visibility:public"],
|
|
|
|
|
deps = [
|
|
|
|
|
":aten_cuda_cpp",
|
|
|
|
|
"@cuda//:cublas",
|
|
|
|
|
"@cuda//:cufft",
|
|
|
|
|
"@cuda//:cusparse",
|
|
|
|
|
],
|
|
|
|
|
alwayslink = True,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
# caffe2
|
|
|
|
|
CAFFE2_COPTS = COMMON_COPTS + [
|
|
|
|
|
"-Dcaffe2_EXPORTS",
|
|
|
|
|
"-DCAFFE2_USE_GLOO",
|
|
|
|
|
"-DCAFFE2_USE_CUDNN",
|
|
|
|
|
"-DCAFFE2_BUILD_MAIN_LIB",
|
|
|
|
|
"-fvisibility-inlines-hidden",
|
|
|
|
|
"-fno-math-errno",
|
|
|
|
|
"-fno-trapping-math",
|
|
|
|
|
]
|
|
|
|
|
|
|
|
|
|
proto_library(
|
|
|
|
|
name = "caffe2_proto_source",
|
|
|
|
|
srcs = glob([
|
|
|
|
|
"caffe2/proto/*.proto",
|
|
|
|
|
]),
|
|
|
|
|
visibility = ["//visibility:public"],
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
cc_proto_library(
|
|
|
|
|
name = "caffe2_protos",
|
|
|
|
|
deps = [":caffe2_proto_source"],
|
|
|
|
|
)
|
|
|
|
|
|
2020-08-27 19:09:34 +00:00
|
|
|
header_template_rule(
|
2020-04-07 05:48:33 +00:00
|
|
|
name = "caffe2_core_macros_h",
|
|
|
|
|
src = "caffe2/core/macros.h.in",
|
|
|
|
|
out = "caffe2/core/macros.h",
|
|
|
|
|
substitutions = {
|
|
|
|
|
"@CAFFE2_VERSION_MAJOR@": "1",
|
|
|
|
|
"@CAFFE2_VERSION_MINOR@": "3",
|
|
|
|
|
"@CAFFE2_VERSION_PATCH@": "0",
|
|
|
|
|
"cmakedefine": "define",
|
|
|
|
|
"#define CAFFE2_FORCE_FALLBACK_CUDA_MPI": "/* #undef CAFFE2_FORCE_FALLBACK_CUDA_MPI */",
|
|
|
|
|
"#define CAFFE2_HAS_MKL_DNN": "/* #undef CAFFE2_HAS_MKL_DNN */",
|
|
|
|
|
"#define CAFFE2_HAS_MKL_SGEMM_PACK": "/* #undef CAFFE2_HAS_MKL_SGEMM_PACK */",
|
|
|
|
|
"#define CAFFE2_THREADPOOL_MAIN_IMBALANCE": "/* #undef CAFFE2_THREADPOOL_MAIN_IMBALANCE */",
|
|
|
|
|
"#define CAFFE2_THREADPOOL_STATS": "/* #undef CAFFE2_THREADPOOL_STATS */",
|
|
|
|
|
"#define CAFFE2_USE_ACCELERATE": "/* #undef CAFFE2_USE_ACCELERATE */",
|
|
|
|
|
"#define CAFFE2_USE_EIGEN_FOR_BLAS": "/* #undef CAFFE2_USE_EIGEN_FOR_BLAS */",
|
|
|
|
|
"#define CAFFE2_USE_FBCODE": "/* #undef CAFFE2_USE_FBCODE */",
|
|
|
|
|
"#define CAFFE2_USE_GOOGLE_GLOG": "/* #undef CAFFE2_USE_GOOGLE_GLOG */",
|
|
|
|
|
"#define CAFFE2_USE_LITE_PROTO": "/* #undef CAFFE2_USE_LITE_PROTO */",
|
|
|
|
|
"#define CAFFE2_USE_MKL\n": "/* #undef CAFFE2_USE_MKL */\n",
|
|
|
|
|
"#define CAFFE2_USE_NVTX": "/* #undef CAFFE2_USE_NVTX */",
|
|
|
|
|
"#define CAFFE2_USE_TRT": "/* #undef CAFFE2_USE_TRT */",
|
|
|
|
|
},
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
filegroup(
|
|
|
|
|
name = "caffe2_contrib_srcs",
|
|
|
|
|
srcs = [
|
|
|
|
|
"caffe2/contrib/gloo/allgather_ops.cc",
|
|
|
|
|
"caffe2/contrib/gloo/allreduce_ops.cc",
|
|
|
|
|
"caffe2/contrib/gloo/barrier_ops.cc",
|
|
|
|
|
"caffe2/contrib/gloo/broadcast_ops.cc",
|
|
|
|
|
"caffe2/contrib/gloo/common.cc",
|
|
|
|
|
"caffe2/contrib/gloo/common_world_ops.cc",
|
|
|
|
|
"caffe2/contrib/gloo/context.cc",
|
|
|
|
|
"caffe2/contrib/gloo/reduce_scatter_ops.cc",
|
|
|
|
|
"caffe2/contrib/gloo/store_handler.cc",
|
|
|
|
|
],
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
filegroup(
|
|
|
|
|
name = "caffe2_core_srcs",
|
|
|
|
|
srcs = [
|
|
|
|
|
"caffe2/core/allocator.cc",
|
|
|
|
|
"caffe2/core/blob_serialization.cc",
|
|
|
|
|
"caffe2/core/blob_stats.cc",
|
|
|
|
|
"caffe2/core/common.cc",
|
|
|
|
|
"caffe2/core/context.cc",
|
|
|
|
|
"caffe2/core/context_base.cc",
|
|
|
|
|
"caffe2/core/db.cc",
|
|
|
|
|
"caffe2/core/event.cc",
|
|
|
|
|
"caffe2/core/export_c10_op_to_caffe2.cc",
|
|
|
|
|
"caffe2/core/graph.cc",
|
|
|
|
|
"caffe2/core/init.cc",
|
|
|
|
|
"caffe2/core/init_denormals.cc",
|
|
|
|
|
"caffe2/core/init_intrinsics_check.cc",
|
|
|
|
|
"caffe2/core/init_omp.cc",
|
|
|
|
|
"caffe2/core/int8_serialization.cc",
|
|
|
|
|
"caffe2/core/memonger.cc",
|
|
|
|
|
"caffe2/core/module.cc",
|
|
|
|
|
"caffe2/core/net.cc",
|
|
|
|
|
"caffe2/core/net_async_base.cc",
|
|
|
|
|
"caffe2/core/net_async_scheduling.cc",
|
|
|
|
|
"caffe2/core/net_async_task.cc",
|
|
|
|
|
"caffe2/core/net_async_task_future.cc",
|
|
|
|
|
"caffe2/core/net_async_task_graph.cc",
|
|
|
|
|
"caffe2/core/net_async_tracing.cc",
|
|
|
|
|
"caffe2/core/net_dag_utils.cc",
|
|
|
|
|
"caffe2/core/net_parallel.cc",
|
|
|
|
|
"caffe2/core/net_simple.cc",
|
|
|
|
|
"caffe2/core/net_simple_refcount.cc",
|
|
|
|
|
"caffe2/core/nomnigraph/Representations/NeuralNet.cc",
|
|
|
|
|
"caffe2/core/nomnigraph/tests/test_util.cc",
|
|
|
|
|
"caffe2/core/numa.cc",
|
|
|
|
|
"caffe2/core/operator.cc",
|
|
|
|
|
"caffe2/core/operator_schema.cc",
|
|
|
|
|
"caffe2/core/plan_executor.cc",
|
|
|
|
|
"caffe2/core/prof_dag_counters.cc",
|
|
|
|
|
"caffe2/core/qtensor.cc",
|
|
|
|
|
"caffe2/core/qtensor_serialization.cc",
|
|
|
|
|
"caffe2/core/stats.cc",
|
|
|
|
|
"caffe2/core/tensor.cc",
|
|
|
|
|
"caffe2/core/tensor_int8.cc",
|
|
|
|
|
"caffe2/core/test_utils.cc",
|
|
|
|
|
"caffe2/core/transform.cc",
|
|
|
|
|
"caffe2/core/types.cc",
|
|
|
|
|
"caffe2/core/workspace.cc",
|
|
|
|
|
],
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
filegroup(
|
|
|
|
|
name = "caffe2_distributed_srcs",
|
|
|
|
|
srcs = [
|
|
|
|
|
"caffe2/distributed/file_store_handler.cc",
|
|
|
|
|
"caffe2/distributed/file_store_handler_op.cc",
|
|
|
|
|
"caffe2/distributed/store_handler.cc",
|
|
|
|
|
"caffe2/distributed/store_ops.cc",
|
|
|
|
|
],
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
filegroup(
|
|
|
|
|
name = "caffe2_ideep_srcs",
|
|
|
|
|
srcs = [
|
|
|
|
|
"caffe2/ideep/operators/adam_op.cc",
|
|
|
|
|
"caffe2/ideep/operators/channel_shuffle_op.cc",
|
|
|
|
|
"caffe2/ideep/operators/concat_split_op.cc",
|
|
|
|
|
"caffe2/ideep/operators/conv_op.cc",
|
|
|
|
|
"caffe2/ideep/operators/conv_transpose_op.cc",
|
|
|
|
|
"caffe2/ideep/operators/dropout_op.cc",
|
|
|
|
|
"caffe2/ideep/operators/elementwise_sum_op.cc",
|
|
|
|
|
"caffe2/ideep/operators/expand_squeeze_dims_op.cc",
|
|
|
|
|
"caffe2/ideep/operators/fully_connected_op.cc",
|
|
|
|
|
"caffe2/ideep/operators/local_response_normalization_op.cc",
|
|
|
|
|
"caffe2/ideep/operators/momentum_sgd_op.cc",
|
|
|
|
|
"caffe2/ideep/operators/operator_fallback_ideep.cc",
|
|
|
|
|
"caffe2/ideep/operators/order_switch_ops.cc",
|
|
|
|
|
"caffe2/ideep/operators/pool_op.cc",
|
|
|
|
|
"caffe2/ideep/operators/quantization/int8_add_op.cc",
|
|
|
|
|
"caffe2/ideep/operators/quantization/int8_conv_op.cc",
|
|
|
|
|
"caffe2/ideep/operators/quantization/int8_dequantize_op.cc",
|
|
|
|
|
"caffe2/ideep/operators/quantization/int8_fully_connected_op.cc",
|
|
|
|
|
"caffe2/ideep/operators/quantization/int8_given_tensor_fill_op.cc",
|
|
|
|
|
"caffe2/ideep/operators/quantization/int8_pool_op.cc",
|
|
|
|
|
"caffe2/ideep/operators/quantization/int8_quantize_op.cc",
|
|
|
|
|
"caffe2/ideep/operators/quantization/int8_relu_op.cc",
|
|
|
|
|
"caffe2/ideep/operators/queue_ops.cc",
|
|
|
|
|
"caffe2/ideep/operators/relu_op.cc",
|
|
|
|
|
"caffe2/ideep/operators/reshape_op.cc",
|
|
|
|
|
"caffe2/ideep/operators/shape_op.cc",
|
|
|
|
|
"caffe2/ideep/operators/sigmoid_op.cc",
|
|
|
|
|
"caffe2/ideep/operators/spatial_batch_norm_op.cc",
|
|
|
|
|
"caffe2/ideep/operators/transpose_op.cc",
|
|
|
|
|
"caffe2/ideep/operators/utility_ops.cc",
|
|
|
|
|
"caffe2/ideep/utils/ideep_register.cc",
|
|
|
|
|
],
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
filegroup(
|
|
|
|
|
name = "caffe2_onnx_srcs",
|
|
|
|
|
srcs = [
|
|
|
|
|
"caffe2/onnx/backend.cc",
|
|
|
|
|
"caffe2/onnx/backend_rep.cc",
|
|
|
|
|
"caffe2/onnx/device.cc",
|
|
|
|
|
"caffe2/onnx/helper.cc",
|
|
|
|
|
"caffe2/onnx/offline_tensor.cc",
|
|
|
|
|
"caffe2/onnx/onnx_exporter.cc",
|
|
|
|
|
"caffe2/onnx/onnxifi_graph_info.cc",
|
|
|
|
|
"caffe2/onnx/onnxifi_init.cc",
|
|
|
|
|
],
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
filegroup(
|
|
|
|
|
name = "caffe2_operators_srcs",
|
|
|
|
|
srcs = [
|
|
|
|
|
"caffe2/operators/abs_op.cc",
|
|
|
|
|
"caffe2/operators/accumulate_op.cc",
|
|
|
|
|
"caffe2/operators/accuracy_op.cc",
|
|
|
|
|
"caffe2/operators/acos_op.cc",
|
|
|
|
|
"caffe2/operators/affine_channel_op.cc",
|
|
|
|
|
"caffe2/operators/alias_with_name.cc",
|
|
|
|
|
"caffe2/operators/apmeter_op.cc",
|
|
|
|
|
"caffe2/operators/arg_ops.cc",
|
|
|
|
|
"caffe2/operators/asin_op.cc",
|
|
|
|
|
"caffe2/operators/assert_op.cc",
|
|
|
|
|
"caffe2/operators/atan_op.cc",
|
|
|
|
|
"caffe2/operators/atomic_ops.cc",
|
|
|
|
|
"caffe2/operators/batch_box_cox_op.cc",
|
|
|
|
|
"caffe2/operators/batch_bucketize_op.cc",
|
|
|
|
|
"caffe2/operators/batch_gather_ops.cc",
|
|
|
|
|
"caffe2/operators/batch_matmul_op.cc",
|
|
|
|
|
"caffe2/operators/batch_moments_op.cc",
|
|
|
|
|
"caffe2/operators/batch_permutation_op.cc",
|
|
|
|
|
"caffe2/operators/batch_sparse_to_dense_op.cc",
|
|
|
|
|
"caffe2/operators/bbox_transform_op.cc",
|
|
|
|
|
"caffe2/operators/bisect_percentile_op.cc",
|
|
|
|
|
"caffe2/operators/boolean_mask_ops.cc",
|
|
|
|
|
"caffe2/operators/boolean_unmask_ops.cc",
|
|
|
|
|
"caffe2/operators/box_with_nms_limit_op.cc",
|
|
|
|
|
"caffe2/operators/bucketize_op.cc",
|
|
|
|
|
"caffe2/operators/byte_weight_dequant_op.cc",
|
|
|
|
|
"caffe2/operators/cast_op.cc",
|
|
|
|
|
"caffe2/operators/cbrt_op.cc",
|
|
|
|
|
"caffe2/operators/cc_bmm_bg_op.cc",
|
|
|
|
|
"caffe2/operators/ceil_op.cc",
|
|
|
|
|
"caffe2/operators/channel_backprop_stats_op.cc",
|
|
|
|
|
"caffe2/operators/channel_shuffle_op.cc",
|
|
|
|
|
"caffe2/operators/channel_stats_op.cc",
|
|
|
|
|
"caffe2/operators/clip_op.cc",
|
|
|
|
|
"caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.cc",
|
|
|
|
|
"caffe2/operators/communicator_op.cc",
|
|
|
|
|
"caffe2/operators/concat_split_op.cc",
|
|
|
|
|
"caffe2/operators/conditional_op.cc",
|
|
|
|
|
"caffe2/operators/conv_gradient_op.cc",
|
|
|
|
|
"caffe2/operators/conv_op.cc",
|
|
|
|
|
"caffe2/operators/conv_op_eigen.cc",
|
|
|
|
|
"caffe2/operators/conv_op_shared.cc",
|
|
|
|
|
"caffe2/operators/conv_transpose_gradient_op.cc",
|
|
|
|
|
"caffe2/operators/conv_transpose_op_mobile.cc",
|
|
|
|
|
"caffe2/operators/copy_op.cc",
|
|
|
|
|
"caffe2/operators/copy_rows_to_tensor_op.cc",
|
|
|
|
|
"caffe2/operators/cos_op.cc",
|
|
|
|
|
"caffe2/operators/cosh_op.cc",
|
|
|
|
|
"caffe2/operators/cosine_embedding_criterion_op.cc",
|
|
|
|
|
"caffe2/operators/counter_ops.cc",
|
|
|
|
|
"caffe2/operators/crash_op.cc",
|
|
|
|
|
"caffe2/operators/create_scope_op.cc",
|
|
|
|
|
"caffe2/operators/crf_viterbi_op.cc",
|
|
|
|
|
"caffe2/operators/cross_entropy_op.cc",
|
|
|
|
|
"caffe2/operators/ctc_beam_search_decoder_op.cc",
|
|
|
|
|
"caffe2/operators/ctc_greedy_decoder_op.cc",
|
|
|
|
|
"caffe2/operators/cube_op.cc",
|
|
|
|
|
"caffe2/operators/data_couple.cc",
|
|
|
|
|
"caffe2/operators/dataset_ops.cc",
|
|
|
|
|
"caffe2/operators/deform_conv_gradient_op.cc",
|
|
|
|
|
"caffe2/operators/deform_conv_op.cc",
|
|
|
|
|
"caffe2/operators/dense_vector_to_id_list_op.cc",
|
|
|
|
|
"caffe2/operators/distance_op.cc",
|
|
|
|
|
"caffe2/operators/do_op.cc",
|
|
|
|
|
"caffe2/operators/dropout_op.cc",
|
|
|
|
|
"caffe2/operators/elementwise_add_gradient_op.cc",
|
|
|
|
|
"caffe2/operators/elementwise_add_op.cc",
|
|
|
|
|
"caffe2/operators/elementwise_div_gradient_op.cc",
|
|
|
|
|
"caffe2/operators/elementwise_div_op.cc",
|
|
|
|
|
"caffe2/operators/elementwise_linear_op.cc",
|
|
|
|
|
"caffe2/operators/elementwise_logical_ops.cc",
|
|
|
|
|
"caffe2/operators/elementwise_mul_gradient_op.cc",
|
|
|
|
|
"caffe2/operators/elementwise_mul_op.cc",
|
|
|
|
|
"caffe2/operators/elementwise_ops.cc",
|
|
|
|
|
"caffe2/operators/elementwise_ops_schema.cc",
|
|
|
|
|
"caffe2/operators/elementwise_ops_utils.cc",
|
|
|
|
|
"caffe2/operators/elementwise_sub_gradient_op.cc",
|
|
|
|
|
"caffe2/operators/elementwise_sub_op.cc",
|
|
|
|
|
"caffe2/operators/elementwise_sum_op.cc",
|
|
|
|
|
"caffe2/operators/elu_op.cc",
|
|
|
|
|
"caffe2/operators/enforce_finite_op.cc",
|
|
|
|
|
"caffe2/operators/ensure_clipped_op.cc",
|
|
|
|
|
"caffe2/operators/ensure_cpu_output_op.cc",
|
|
|
|
|
"caffe2/operators/erf_op.cc",
|
|
|
|
|
"caffe2/operators/exp_op.cc",
|
|
|
|
|
"caffe2/operators/expand_op.cc",
|
|
|
|
|
"caffe2/operators/expand_squeeze_dims_op.cc",
|
|
|
|
|
"caffe2/operators/fc_inference.cc",
|
|
|
|
|
"caffe2/operators/feature_maps_ops.cc",
|
|
|
|
|
"caffe2/operators/feed_blob_op.cc",
|
|
|
|
|
"caffe2/operators/filler_op.cc",
|
|
|
|
|
"caffe2/operators/find_duplicate_elements_op.cc",
|
|
|
|
|
"caffe2/operators/find_op.cc",
|
|
|
|
|
"caffe2/operators/flatten_op.cc",
|
|
|
|
|
"caffe2/operators/flexible_top_k.cc",
|
|
|
|
|
"caffe2/operators/floor_op.cc",
|
|
|
|
|
"caffe2/operators/free_op.cc",
|
|
|
|
|
"caffe2/operators/fully_connected_op.cc",
|
|
|
|
|
"caffe2/operators/fused_rowwise_8bit_conversion_ops.cc",
|
|
|
|
|
"caffe2/operators/fused_rowwise_random_quantization_ops.cc",
|
|
|
|
|
"caffe2/operators/gather_fused_8bit_rowwise_op.cc",
|
|
|
|
|
"caffe2/operators/gather_op.cc",
|
|
|
|
|
"caffe2/operators/gather_ranges_to_dense_op.cc",
|
|
|
|
|
"caffe2/operators/gelu_op.cc",
|
|
|
|
|
"caffe2/operators/generate_proposals_op.cc",
|
|
|
|
|
"caffe2/operators/given_tensor_byte_string_to_uint8_fill_op.cc",
|
|
|
|
|
"caffe2/operators/given_tensor_fill_op.cc",
|
|
|
|
|
"caffe2/operators/glu_op.cc",
|
|
|
|
|
"caffe2/operators/group_norm_op.cc",
|
|
|
|
|
"caffe2/operators/gru_unit_op.cc",
|
|
|
|
|
"caffe2/operators/h_softmax_op.cc",
|
|
|
|
|
"caffe2/operators/half_float_ops.cc",
|
|
|
|
|
"caffe2/operators/hard_sigmoid_op.cc",
|
|
|
|
|
"caffe2/operators/heatmap_max_keypoint_op.cc",
|
|
|
|
|
"caffe2/operators/if_op.cc",
|
|
|
|
|
"caffe2/operators/im2col_op.cc",
|
|
|
|
|
"caffe2/operators/index_hash_ops.cc",
|
|
|
|
|
"caffe2/operators/index_ops.cc",
|
|
|
|
|
"caffe2/operators/inference_lstm_op.cc",
|
|
|
|
|
"caffe2/operators/instance_norm_gradient_op.cc",
|
|
|
|
|
"caffe2/operators/instance_norm_op.cc",
|
|
|
|
|
"caffe2/operators/integral_image_op.cc",
|
|
|
|
|
"caffe2/operators/is_empty_op.cc",
|
|
|
|
|
"caffe2/operators/jsd_op.cc",
|
|
|
|
|
"caffe2/operators/key_split_ops.cc",
|
|
|
|
|
"caffe2/operators/last_n_window_collector.cc",
|
|
|
|
|
"caffe2/operators/layer_norm_op.cc",
|
|
|
|
|
"caffe2/operators/leaky_relu_op.cc",
|
|
|
|
|
"caffe2/operators/length_split_op.cc",
|
|
|
|
|
"caffe2/operators/lengths_pad_op.cc",
|
|
|
|
|
"caffe2/operators/lengths_reducer_fused_8bit_rowwise_ops.cc",
|
|
|
|
|
"caffe2/operators/lengths_reducer_ops.cc",
|
|
|
|
|
"caffe2/operators/lengths_reducer_rowwise_8bit_ops.cc",
|
|
|
|
|
"caffe2/operators/lengths_tile_op.cc",
|
|
|
|
|
"caffe2/operators/lengths_top_k_op.cc",
|
|
|
|
|
"caffe2/operators/listwise_l2r_op.cc",
|
|
|
|
|
"caffe2/operators/load_save_op.cc",
|
|
|
|
|
"caffe2/operators/load_save_op_util.cc",
|
|
|
|
|
"caffe2/operators/local_response_normalization_op.cc",
|
|
|
|
|
"caffe2/operators/locally_connected_op.cc",
|
|
|
|
|
"caffe2/operators/locally_connected_op_util.cc",
|
|
|
|
|
"caffe2/operators/log_op.cc",
|
|
|
|
|
"caffe2/operators/logit_op.cc",
|
|
|
|
|
"caffe2/operators/loss_op.cc",
|
|
|
|
|
"caffe2/operators/lp_pool_op.cc",
|
|
|
|
|
"caffe2/operators/lpnorm_op.cc",
|
|
|
|
|
"caffe2/operators/lstm_unit_op.cc",
|
|
|
|
|
"caffe2/operators/map_ops.cc",
|
|
|
|
|
"caffe2/operators/margin_ranking_criterion_op.cc",
|
|
|
|
|
"caffe2/operators/matmul_op.cc",
|
|
|
|
|
"caffe2/operators/mean_op.cc",
|
|
|
|
|
"caffe2/operators/merge_id_lists_op.cc",
|
|
|
|
|
"caffe2/operators/minmax_gradient_ops.cc",
|
|
|
|
|
"caffe2/operators/minmax_ops.cc",
|
|
|
|
|
"caffe2/operators/mod_op.cc",
|
|
|
|
|
"caffe2/operators/moments_op.cc",
|
|
|
|
|
"caffe2/operators/multi_class_accuracy_op.cc",
|
|
|
|
|
"caffe2/operators/negate_gradient_op.cc",
|
|
|
|
|
"caffe2/operators/negative_op.cc",
|
|
|
|
|
"caffe2/operators/ngram_ops.cc",
|
|
|
|
|
"caffe2/operators/norm_planar_yuv_op.cc",
|
|
|
|
|
"caffe2/operators/normalize_l1_op.cc",
|
|
|
|
|
"caffe2/operators/normalize_op.cc",
|
|
|
|
|
"caffe2/operators/numpy_tile_op.cc",
|
|
|
|
|
"caffe2/operators/one_hot_ops.cc",
|
|
|
|
|
"caffe2/operators/onnx_while_op.cc",
|
|
|
|
|
"caffe2/operators/order_switch_ops.cc",
|
|
|
|
|
"caffe2/operators/pack_rnn_sequence_op.cc",
|
|
|
|
|
"caffe2/operators/pack_segments.cc",
|
|
|
|
|
"caffe2/operators/pad_op.cc",
|
|
|
|
|
"caffe2/operators/partition_ops.cc",
|
|
|
|
|
"caffe2/operators/percentile_op.cc",
|
|
|
|
|
"caffe2/operators/perplexity_op.cc",
|
|
|
|
|
"caffe2/operators/piecewise_linear_transform_op.cc",
|
|
|
|
|
"caffe2/operators/pool_gradient_op.cc",
|
|
|
|
|
"caffe2/operators/pool_op.cc",
|
|
|
|
|
"caffe2/operators/pool_op_util.cc",
|
|
|
|
|
"caffe2/operators/pow_op.cc",
|
|
|
|
|
"caffe2/operators/prelu_op.cc",
|
|
|
|
|
"caffe2/operators/prepend_dim_op.cc",
|
|
|
|
|
"caffe2/operators/quant_decode_op.cc",
|
|
|
|
|
"caffe2/operators/rank_loss_op.cc",
|
|
|
|
|
"caffe2/operators/reciprocal_gradient_op.cc",
|
|
|
|
|
"caffe2/operators/reciprocal_op.cc",
|
|
|
|
|
"caffe2/operators/reduce_front_back_max_ops.cc",
|
|
|
|
|
"caffe2/operators/reduce_front_back_mean_ops.cc",
|
|
|
|
|
"caffe2/operators/reduce_front_back_sum_ops.cc",
|
|
|
|
|
"caffe2/operators/reduce_ops.cc",
|
|
|
|
|
"caffe2/operators/reduction_ops.cc",
|
|
|
|
|
"caffe2/operators/relu_n_op.cc",
|
|
|
|
|
"caffe2/operators/relu_op.cc",
|
|
|
|
|
"caffe2/operators/remove_data_blocks_op.cc",
|
|
|
|
|
"caffe2/operators/replace_nan_op.cc",
|
|
|
|
|
"caffe2/operators/reservoir_sampling.cc",
|
|
|
|
|
"caffe2/operators/reshape_op.cc",
|
|
|
|
|
"caffe2/operators/resize_3d_op.cc",
|
|
|
|
|
"caffe2/operators/resize_op.cc",
|
|
|
|
|
"caffe2/operators/reverse_packed_segs_op.cc",
|
|
|
|
|
"caffe2/operators/rmac_regions_op.cc",
|
|
|
|
|
"caffe2/operators/rnn/recurrent_network_blob_fetcher_op.cc",
|
|
|
|
|
"caffe2/operators/rnn/recurrent_network_executor.cc",
|
|
|
|
|
"caffe2/operators/rnn/recurrent_network_op.cc",
|
|
|
|
|
"caffe2/operators/roi_align_gradient_op.cc",
|
|
|
|
|
"caffe2/operators/roi_align_op.cc",
|
|
|
|
|
"caffe2/operators/roi_align_rotated_gradient_op.cc",
|
|
|
|
|
"caffe2/operators/roi_align_rotated_op.cc",
|
|
|
|
|
"caffe2/operators/roi_pool_op.cc",
|
|
|
|
|
"caffe2/operators/rowmul_op.cc",
|
|
|
|
|
"caffe2/operators/rsqrt_op.cc",
|
|
|
|
|
"caffe2/operators/scale_blobs_op.cc",
|
|
|
|
|
"caffe2/operators/scale_op.cc",
|
|
|
|
|
"caffe2/operators/segment_reduction_op.cc",
|
|
|
|
|
"caffe2/operators/selu_op.cc",
|
|
|
|
|
"caffe2/operators/sequence_ops.cc",
|
|
|
|
|
"caffe2/operators/shape_op.cc",
|
|
|
|
|
"caffe2/operators/sigmoid_gradient_op.cc",
|
|
|
|
|
"caffe2/operators/sigmoid_op.cc",
|
|
|
|
|
"caffe2/operators/sin_op.cc",
|
|
|
|
|
"caffe2/operators/sinh_op.cc",
|
|
|
|
|
"caffe2/operators/sinusoid_position_encoding_op.cc",
|
|
|
|
|
"caffe2/operators/slice_op.cc",
|
|
|
|
|
"caffe2/operators/softmax_op.cc",
|
|
|
|
|
"caffe2/operators/softmax_utils.cc",
|
|
|
|
|
"caffe2/operators/softmax_with_loss_op.cc",
|
|
|
|
|
"caffe2/operators/softplus_op.cc",
|
|
|
|
|
"caffe2/operators/softsign_op.cc",
|
|
|
|
|
"caffe2/operators/space_batch_op.cc",
|
|
|
|
|
"caffe2/operators/sparse_dropout_with_replacement_op.cc",
|
|
|
|
|
"caffe2/operators/sparse_normalize_op.cc",
|
|
|
|
|
"caffe2/operators/sparse_to_dense_mask_op.cc",
|
|
|
|
|
"caffe2/operators/sparse_to_dense_op.cc",
|
|
|
|
|
"caffe2/operators/spatial_batch_norm_gradient_op.cc",
|
|
|
|
|
"caffe2/operators/spatial_batch_norm_op.cc",
|
|
|
|
|
"caffe2/operators/spatial_softmax_with_loss_op.cc",
|
|
|
|
|
"caffe2/operators/sqr_op.cc",
|
|
|
|
|
"caffe2/operators/sqrt_op.cc",
|
|
|
|
|
"caffe2/operators/square_root_divide_op.cc",
|
|
|
|
|
"caffe2/operators/stats_ops.cc",
|
|
|
|
|
"caffe2/operators/stats_put_ops.cc",
|
|
|
|
|
"caffe2/operators/stop_gradient.cc",
|
|
|
|
|
"caffe2/operators/string_ops.cc",
|
|
|
|
|
"caffe2/operators/stump_func_op.cc",
|
|
|
|
|
"caffe2/operators/stylizer_ops.cc",
|
|
|
|
|
"caffe2/operators/summarize_op.cc",
|
|
|
|
|
"caffe2/operators/swish_op.cc",
|
|
|
|
|
"caffe2/operators/tan_op.cc",
|
|
|
|
|
"caffe2/operators/tanh_gradient_op.cc",
|
|
|
|
|
"caffe2/operators/tanh_op.cc",
|
|
|
|
|
"caffe2/operators/tensor_protos_db_input.cc",
|
|
|
|
|
"caffe2/operators/text_file_reader.cc",
|
|
|
|
|
"caffe2/operators/text_file_reader_utils.cc",
|
|
|
|
|
"caffe2/operators/thresholded_relu_op.cc",
|
|
|
|
|
"caffe2/operators/tile_op.cc",
|
|
|
|
|
"caffe2/operators/top_k.cc",
|
|
|
|
|
"caffe2/operators/transpose_op.cc",
|
|
|
|
|
"caffe2/operators/tt_linear_op.cc",
|
|
|
|
|
"caffe2/operators/unique_ops.cc",
|
|
|
|
|
"caffe2/operators/upsample_op.cc",
|
|
|
|
|
"caffe2/operators/utility_ops.cc",
|
|
|
|
|
"caffe2/operators/variable_length_sequence_padding.cc",
|
|
|
|
|
"caffe2/operators/weighted_multi_sampling_op.cc",
|
|
|
|
|
"caffe2/operators/weighted_sample_op.cc",
|
|
|
|
|
"caffe2/operators/while_op.cc",
|
|
|
|
|
"caffe2/operators/workspace_ops.cc",
|
|
|
|
|
"caffe2/operators/zero_gradient_op.cc",
|
|
|
|
|
],
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
# Graph-optimization sources (caffe2/opt): model converters, backend cutting,
# ONNXIFI/TVM transformers, fusion, dead-code elimination, shape inference,
# and device/distributed rewrite passes.
filegroup(
    name = "caffe2_opt_srcs",
    srcs = [
        "caffe2/opt/annotations.cc",
        "caffe2/opt/backend_cutting.cc",
        "caffe2/opt/backend_transformer_base.cc",
        "caffe2/opt/bound_shape_inferencer.cc",
        "caffe2/opt/converter.cc",
        "caffe2/opt/dead_code_elim.cc",
        "caffe2/opt/device.cc",
        "caffe2/opt/distributed.cc",
        "caffe2/opt/distributed_converter.cc",
        "caffe2/opt/fusion.cc",
        "caffe2/opt/mobile.cc",
        "caffe2/opt/onnxifi_op.cc",
        "caffe2/opt/onnxifi_transformer.cc",
        "caffe2/opt/optimize_ideep.cc",
        "caffe2/opt/optimizer.cc",
        "caffe2/opt/passes.cc",
        "caffe2/opt/shape_info.cc",
        "caffe2/opt/tvm_transformer.cc",
    ],
)
|
|
|
|
|
|
|
|
|
|
# Performance-kernel dispatch sources (caffe2/perfkernels): baseline CPU
# implementations of embedding lookups, adagrad, axpy, etc. Architecture-specific
# variants (AVX/AVX2/AVX512) are built as separate targets and selected at runtime.
filegroup(
    name = "caffe2_perfkernels_srcs",
    srcs = [
        "caffe2/perfkernels/adagrad.cc",
        "caffe2/perfkernels/embedding_lookup.cc",
        "caffe2/perfkernels/embedding_lookup_idx.cc",
        "caffe2/perfkernels/fused_8bit_rowwise_embedding_lookup.cc",
        "caffe2/perfkernels/fused_8bit_rowwise_embedding_lookup_idx.cc",
        "caffe2/perfkernels/fused_nbit_rowwise_conversion.cc",
        "caffe2/perfkernels/lstm_unit_cpu_common.cc",
        "caffe2/perfkernels/math_cpu_base.cc",
        "caffe2/perfkernels/typed_axpy.cc",
    ],
)
|
|
|
|
|
|
|
|
|
|
# Predictor sources (caffe2/predictor). NOTE: the list deliberately includes
# "data_filler.h" in srcs (not hdrs) — presumably so the header travels with
# the emulator sources without being a public header; confirm before changing.
filegroup(
    name = "caffe2_predictor_srcs",
    srcs = [
        "caffe2/predictor/emulator/data_filler.cc",
        "caffe2/predictor/emulator/data_filler.h",
        "caffe2/predictor/predictor.cc",
        "caffe2/predictor/predictor_config.cc",
        "caffe2/predictor/predictor_utils.cc",
    ],
)
|
|
|
|
|
|
|
|
|
|
# Server-side quantization sources (caffe2/quantization/server): DNNLOWP
# low-precision operators, FBGEMM packing helpers, and calibration utilities
# (histograms, KL/norm minimization, p99).
filegroup(
    name = "caffe2_quantization_srcs",
    srcs = [
        "caffe2/quantization/server/activation_distribution_observer.cc",
        "caffe2/quantization/server/batch_matmul_dnnlowp_op.cc",
        "caffe2/quantization/server/caffe2_dnnlowp_utils.cc",
        "caffe2/quantization/server/channel_shuffle_dnnlowp_op.cc",
        "caffe2/quantization/server/concat_dnnlowp_op.cc",
        "caffe2/quantization/server/conv_dnnlowp_acc16_op.cc",
        "caffe2/quantization/server/conv_dnnlowp_op.cc",
        "caffe2/quantization/server/conv_relu_op.cc",
        "caffe2/quantization/server/dequantize_dnnlowp_op.cc",
        "caffe2/quantization/server/dnnlowp.cc",
        "caffe2/quantization/server/dnnlowp_partition.cc",
        "caffe2/quantization/server/dynamic_histogram.cc",
        "caffe2/quantization/server/elementwise_add_dnnlowp_op.cc",
        "caffe2/quantization/server/elementwise_linear_dnnlowp_op.cc",
        "caffe2/quantization/server/elementwise_mul_dnnlowp_op.cc",
        "caffe2/quantization/server/elementwise_sum_dnnlowp_op.cc",
        "caffe2/quantization/server/elementwise_sum_relu_op.cc",
        "caffe2/quantization/server/fbgemm_pack_matrix_cache.cc",
        "caffe2/quantization/server/fbgemm_pack_op.cc",
        "caffe2/quantization/server/fully_connected_dnnlowp_acc16_op.cc",
        "caffe2/quantization/server/fully_connected_dnnlowp_op.cc",
        "caffe2/quantization/server/fully_connected_fake_lowp_op.cc",
        "caffe2/quantization/server/group_norm_dnnlowp_op.cc",
        "caffe2/quantization/server/int8_gen_quant_params.cc",
        "caffe2/quantization/server/kl_minimization.cc",
        "caffe2/quantization/server/lstm_unit_dnnlowp_op.cc",
        "caffe2/quantization/server/norm_minimization.cc",
        "caffe2/quantization/server/p99.cc",
        "caffe2/quantization/server/pool_dnnlowp_op.cc",
        "caffe2/quantization/server/quantize_dnnlowp_op.cc",
        "caffe2/quantization/server/relu_dnnlowp_op.cc",
        "caffe2/quantization/server/sigmoid.cc",
        "caffe2/quantization/server/sigmoid_dnnlowp_op.cc",
        "caffe2/quantization/server/spatial_batch_norm_dnnlowp_op.cc",
        "caffe2/quantization/server/tanh.cc",
        "caffe2/quantization/server/tanh_dnnlowp_op.cc",
        "caffe2/quantization/server/utility_dnnlowp_ops.cc",
    ],
)
|
|
|
|
|
|
|
|
|
|
# Blob-queue sources (caffe2/queue): blobs queues, the DB-backed variant,
# rebatching queues, and their operator wrappers.
filegroup(
    name = "caffe2_queue_srcs",
    srcs = [
        "caffe2/queue/blobs_queue.cc",
        "caffe2/queue/blobs_queue_db.cc",
        "caffe2/queue/queue_ops.cc",
        "caffe2/queue/rebatching_queue.cc",
        "caffe2/queue/rebatching_queue_ops.cc",
    ],
)
|
|
|
|
|
|
|
|
|
|
# Serialization sources (caffe2/serialize): the zip-archive inline container
# and its file/stream read adapters.
filegroup(
    name = "caffe2_serialize_srcs",
    srcs = [
        "caffe2/serialize/file_adapter.cc",
        "caffe2/serialize/inline_container.cc",
        "caffe2/serialize/istream_adapter.cc",
        "caffe2/serialize/read_adapter_interface.cc",
    ],
)
|
|
|
|
|
|
|
|
|
|
# Optimizer sources (caffe2/sgd): CPU implementations of the training
# update operators (Adagrad, Adam, FTRL, LARS, momentum SGD, ...).
filegroup(
    name = "caffe2_sgd_srcs",
    srcs = [
        "caffe2/sgd/adadelta_op.cc",
        "caffe2/sgd/adagrad_op.cc",
        "caffe2/sgd/adam_op.cc",
        "caffe2/sgd/clip_tensor_op.cc",
        "caffe2/sgd/ftrl_op.cc",
        "caffe2/sgd/gftrl_op.cc",
        "caffe2/sgd/iter_op.cc",
        "caffe2/sgd/lars_op.cc",
        "caffe2/sgd/learning_rate_adaption_op.cc",
        "caffe2/sgd/learning_rate_op.cc",
        "caffe2/sgd/momentum_sgd_op.cc",
        "caffe2/sgd/rmsprop_op.cc",
        "caffe2/sgd/wngrad_op.cc",
        "caffe2/sgd/yellowfin_op.cc",
    ],
)
|
|
|
|
|
|
|
|
|
|
# Net-transform sources (caffe2/transforms): pattern-based graph rewrites
# such as common-subexpression elimination and conv-to-NNPACK conversion.
filegroup(
    name = "caffe2_transforms_srcs",
    srcs = [
        "caffe2/transforms/common_subexpression_elimination.cc",
        "caffe2/transforms/conv_to_nnpack_transform.cc",
        "caffe2/transforms/pattern_net_transform.cc",
        "caffe2/transforms/single_op_transform.cc",
    ],
)
|
|
|
|
|
|
|
|
|
|
# Utility sources (caffe2/utils): CPU math kernels, proto helpers, string
# utilities, signal handling, and the thread-pool implementation.
# NOTE: thread_pool_guard uses the .cpp extension, unlike the rest (.cc).
filegroup(
    name = "caffe2_utils_srcs",
    srcs = [
        "caffe2/utils/bench_utils.cc",
        "caffe2/utils/cpuid.cc",
        "caffe2/utils/math/broadcast.cc",
        "caffe2/utils/math/elementwise.cc",
        "caffe2/utils/math/reduce.cc",
        "caffe2/utils/math/transpose.cc",
        "caffe2/utils/math/utils.cc",
        "caffe2/utils/math_cpu.cc",
        "caffe2/utils/murmur_hash3.cc",
        "caffe2/utils/proto_convert.cc",
        "caffe2/utils/proto_utils.cc",
        "caffe2/utils/proto_wrap.cc",
        "caffe2/utils/signal_handler.cc",
        "caffe2/utils/smart_tensor_printer.cc",
        "caffe2/utils/string_utils.cc",
        "caffe2/utils/threadpool/ThreadPool.cc",
        "caffe2/utils/threadpool/pthreadpool.cc",
        "caffe2/utils/threadpool/pthreadpool_impl.cc",
        "caffe2/utils/threadpool/thread_pool_guard.cpp",
    ],
)
|
|
|
|
|
|
|
|
|
|
# CUDA host-side sources: plain C++ files (*.cc) that register GPU/cuDNN
# variants of operators. Files needing nvcc live in :caffe2_cu_srcs instead.
filegroup(
    name = "caffe2_cuda_srcs",
    srcs = [
        "caffe2/contrib/aten/aten_op_gpu.cc",
        "caffe2/contrib/gloo/allreduce_ops_gpu.cc",
        "caffe2/contrib/gloo/broadcast_ops_gpu.cc",
        "caffe2/contrib/gloo/common_world_ops_gpu.cc",
        "caffe2/core/blob_serialization_gpu.cc",
        "caffe2/core/common_cudnn.cc",
        "caffe2/core/common_gpu.cc",
        "caffe2/core/event_gpu.cc",
        "caffe2/db/create_db_op_gpu.cc",
        "caffe2/distributed/file_store_handler_op_gpu.cc",
        "caffe2/operators/communicator_op_gpu.cc",
        "caffe2/operators/concat_split_op_gpu.cc",
        "caffe2/operators/conv_op_cache_cudnn.cc",
        "caffe2/operators/conv_op_cudnn.cc",
        "caffe2/operators/conv_op_gpu.cc",
        "caffe2/operators/conv_op_shared_gpu.cc",
        "caffe2/operators/conv_transpose_op_cudnn.cc",
        "caffe2/operators/conv_transpose_op_gpu.cc",
        "caffe2/operators/counter_ops_gpu.cc",
        "caffe2/operators/do_op_gpu.cc",
        "caffe2/operators/dropout_op_cudnn.cc",
        "caffe2/operators/elementwise_add_op_gpu.cc",
        "caffe2/operators/elementwise_sub_op_gpu.cc",
        "caffe2/operators/elu_op_cudnn.cc",
        "caffe2/operators/exp_op_gpu.cc",
        "caffe2/operators/expand_op_gpu.cc",
        "caffe2/operators/expand_squeeze_dims_op_gpu.cc",
        "caffe2/operators/free_op_gpu.cc",
        "caffe2/operators/fully_connected_op_gpu.cc",
        "caffe2/operators/if_op_gpu.cc",
        "caffe2/operators/im2col_op_gpu.cc",
        "caffe2/operators/load_save_op_gpu.cc",
        "caffe2/operators/local_response_normalization_op_cudnn.cc",
        "caffe2/operators/locally_connected_op_gpu.cc",
        "caffe2/operators/log_op_gpu.cc",
        "caffe2/operators/matmul_op_gpu.cc",
        "caffe2/operators/negate_gradient_op_gpu.cc",
        "caffe2/operators/negative_op_gpu.cc",
        "caffe2/operators/order_switch_ops_cudnn.cc",
        "caffe2/operators/order_switch_ops_gpu.cc",
        "caffe2/operators/pool_op_cudnn.cc",
        "caffe2/operators/prepend_dim_op_gpu.cc",
        "caffe2/operators/reshape_op_gpu.cc",
        "caffe2/operators/rnn/recurrent_network_blob_fetcher_op_gpu.cc",
        "caffe2/operators/rnn/recurrent_network_executor_gpu.cc",
        "caffe2/operators/rnn/recurrent_op_cudnn.cc",
        "caffe2/operators/scale_op_gpu.cc",
        "caffe2/operators/shape_op_gpu.cc",
        "caffe2/operators/sigmoid_op_cudnn.cc",
        "caffe2/operators/softmax_op_cudnn.cc",
        "caffe2/operators/sqr_op_gpu.cc",
        "caffe2/operators/sqrt_op_gpu.cc",
        "caffe2/operators/stop_gradient_gpu.cc",
        "caffe2/operators/tanh_op_cudnn.cc",
        "caffe2/operators/tensor_protos_db_input_gpu.cc",
        "caffe2/operators/transpose_op_cudnn.cc",
        "caffe2/operators/while_op_gpu.cc",
        "caffe2/operators/zero_gradient_op_gpu.cc",
        "caffe2/queue/queue_ops_gpu.cc",
        "caffe2/sgd/iter_op_gpu.cc",
        "caffe2/sgd/learning_rate_op_gpu.cc",
    ],
)
|
|
|
|
|
|
|
|
|
|
# CUDA kernel sources: *.cu.cc files compiled with nvcc via the cu_library
# rule (see //tools/rules:cu.bzl). Kept separate from :caffe2_cuda_srcs,
# which holds host-compiled GPU wrappers.
filegroup(
    name = "caffe2_cu_srcs",
    srcs = [
        "caffe2/core/context_gpu.cu.cc",
        "caffe2/operators/abs_op.cu.cc",
        "caffe2/operators/accumulate_op.cu.cc",
        "caffe2/operators/accuracy_op.cu.cc",
        "caffe2/operators/acos_op.cu.cc",
        "caffe2/operators/affine_channel_op.cu.cc",
        "caffe2/operators/alias_with_name.cu.cc",
        "caffe2/operators/arg_ops.cu.cc",
        "caffe2/operators/asin_op.cu.cc",
        "caffe2/operators/assert_op.cu.cc",
        "caffe2/operators/atan_op.cu.cc",
        "caffe2/operators/batch_gather_ops.cu.cc",
        "caffe2/operators/batch_matmul_op.cu.cc",
        "caffe2/operators/batch_moments_op.cu.cc",
        "caffe2/operators/batch_permutation_op.cu.cc",
        "caffe2/operators/batch_sparse_to_dense_op.cu.cc",
        "caffe2/operators/boolean_mask_ops.cu.cc",
        "caffe2/operators/boolean_unmask_ops.cu.cc",
        "caffe2/operators/bucketize_op.cu.cc",
        "caffe2/operators/cast_op.cu.cc",
        "caffe2/operators/cbrt_op.cu.cc",
        "caffe2/operators/ceil_op.cu.cc",
        "caffe2/operators/channel_backprop_stats_op.cu.cc",
        "caffe2/operators/channel_shuffle_op.cu.cc",
        "caffe2/operators/channel_stats_op.cu.cc",
        "caffe2/operators/channelwise_conv3d_op_cudnn.cu.cc",
        "caffe2/operators/clip_op.cu.cc",
        "caffe2/operators/copy_op.cu.cc",
        "caffe2/operators/cos_op.cu.cc",
        "caffe2/operators/cosh_op.cu.cc",
        "caffe2/operators/cosine_embedding_criterion_op.cu.cc",
        "caffe2/operators/cross_entropy_op.cu.cc",
        "caffe2/operators/cube_op.cu.cc",
        "caffe2/operators/data_couple_gpu.cu.cc",
        "caffe2/operators/deform_conv_op.cu.cc",
        "caffe2/operators/depthwise_3x3_conv_op_cudnn.cu.cc",
        "caffe2/operators/distance_op.cu.cc",
        "caffe2/operators/dropout_op.cu.cc",
        "caffe2/operators/elementwise_div_op.cu.cc",
        "caffe2/operators/elementwise_linear_op.cu.cc",
        "caffe2/operators/elementwise_mul_op.cu.cc",
        "caffe2/operators/elementwise_ops.cu.cc",
        "caffe2/operators/elu_op.cu.cc",
        "caffe2/operators/enforce_finite_op.cu.cc",
        "caffe2/operators/ensure_cpu_output_op.cu.cc",
        "caffe2/operators/erf_op.cu.cc",
        "caffe2/operators/filler_op.cu.cc",
        "caffe2/operators/find_op.cu.cc",
        "caffe2/operators/floor_op.cu.cc",
        "caffe2/operators/gather_op.cu.cc",
        "caffe2/operators/gelu_op.cu.cc",
        "caffe2/operators/generate_proposals_op.cu.cc",
        "caffe2/operators/generate_proposals_op_util_nms_gpu.cu.cc",
        "caffe2/operators/given_tensor_byte_string_to_uint8_fill_op.cu.cc",
        "caffe2/operators/given_tensor_fill_op.cu.cc",
        "caffe2/operators/glu_op.cu.cc",
        "caffe2/operators/group_norm_op.cu.cc",
        "caffe2/operators/gru_unit_op_gpu.cu.cc",
        "caffe2/operators/half_float_ops.cu.cc",
        "caffe2/operators/hard_sigmoid_op.cu.cc",
        "caffe2/operators/instance_norm_op.cu.cc",
        "caffe2/operators/integral_image_op.cu.cc",
        "caffe2/operators/layer_norm_op.cu.cc",
        "caffe2/operators/leaky_relu_op.cu.cc",
        "caffe2/operators/lengths_pad_op.cu.cc",
        "caffe2/operators/lengths_tile_op.cu.cc",
        "caffe2/operators/local_response_normalization_op.cu.cc",
        "caffe2/operators/logit_op.cu.cc",
        "caffe2/operators/loss_op.cu.cc",
        "caffe2/operators/lp_pool_op.cu.cc",
        "caffe2/operators/lstm_unit_op_gpu.cu.cc",
        "caffe2/operators/margin_ranking_criterion_op.cu.cc",
        "caffe2/operators/max_pool_with_index.cu.cc",
        "caffe2/operators/mean_op.cu.cc",
        "caffe2/operators/mem_query_op.cu.cc",
        "caffe2/operators/minmax_ops.cu.cc",
        "caffe2/operators/moments_op.cu.cc",
        "caffe2/operators/multi_class_accuracy_op.cu.cc",
        "caffe2/operators/normalize_ops.cu.cc",
        "caffe2/operators/one_hot_ops.cu.cc",
        "caffe2/operators/pack_segments.cu.cc",
        "caffe2/operators/pad_op_gpu.cu.cc",
        "caffe2/operators/perplexity_op.cu.cc",
        "caffe2/operators/piecewise_linear_transform_op.cu.cc",
        "caffe2/operators/pool_op.cu.cc",
        "caffe2/operators/pow_op.cu.cc",
        "caffe2/operators/prelu_op.cu.cc",
        "caffe2/operators/reciprocal_op.cu.cc",
        "caffe2/operators/reduce_front_back_max_ops.cu.cc",
        "caffe2/operators/reduce_front_back_sum_mean_ops.cu.cc",
        "caffe2/operators/reduce_ops.cu.cc",
        "caffe2/operators/reduction_ops.cu.cc",
        "caffe2/operators/relu_n_op.cu.cc",
        "caffe2/operators/relu_op.cu.cc",
        "caffe2/operators/replace_nan_op.cu.cc",
        "caffe2/operators/resize_3d_op.cu.cc",
        "caffe2/operators/resize_op.cu.cc",
        "caffe2/operators/reverse_packed_segs_op.cu.cc",
        "caffe2/operators/rmac_regions_op.cu.cc",
        "caffe2/operators/rnn/recurrent_network_op_gpu.cu.cc",
        "caffe2/operators/roi_align_gradient_op.cu.cc",
        "caffe2/operators/roi_align_op.cu.cc",
        "caffe2/operators/roi_align_rotated_gradient_op.cu.cc",
        "caffe2/operators/roi_align_rotated_op.cu.cc",
        "caffe2/operators/roi_pool_op.cu.cc",
        "caffe2/operators/rsqrt_op.cu.cc",
        "caffe2/operators/scale_blobs_op.cu.cc",
        "caffe2/operators/segment_reduction_op_gpu.cu.cc",
        "caffe2/operators/selu_op.cu.cc",
        "caffe2/operators/sequence_ops.cu.cc",
        "caffe2/operators/sigmoid_op.cu.cc",
        "caffe2/operators/sin_op.cu.cc",
        "caffe2/operators/sinh_op.cu.cc",
        "caffe2/operators/slice_op.cu.cc",
        "caffe2/operators/softmax_ops.cu.cc",
        "caffe2/operators/softplus_op.cu.cc",
        "caffe2/operators/softsign_op.cu.cc",
        "caffe2/operators/space_batch_op_gpu.cu.cc",
        "caffe2/operators/sparse_normalize_op_gpu.cu.cc",
        "caffe2/operators/sparse_to_dense_op.cu.cc",
        "caffe2/operators/spatial_batch_norm_op.cu.cc",
        "caffe2/operators/spatial_batch_norm_op_cudnn.cu.cc",
        "caffe2/operators/stump_func_op.cu.cc",
        "caffe2/operators/summarize_op.cu.cc",
        "caffe2/operators/swish_op.cu.cc",
        "caffe2/operators/tan_op.cu.cc",
        "caffe2/operators/tanh_op.cu.cc",
        "caffe2/operators/thresholded_relu_op.cu.cc",
        "caffe2/operators/tile_op.cu.cc",
        "caffe2/operators/top_k.cu.cc",
        "caffe2/operators/transpose_op.cu.cc",
        "caffe2/operators/unique_ops.cu.cc",
        "caffe2/operators/upsample_op.cu.cc",
        "caffe2/operators/utility_ops.cu.cc",
        "caffe2/operators/weighted_sample_op.cu.cc",
        "caffe2/sgd/adadelta_op_gpu.cu.cc",
        "caffe2/sgd/adagrad_op_gpu.cu.cc",
        "caffe2/sgd/adam_op_gpu.cu.cc",
        "caffe2/sgd/fp16_momentum_sgd_op.cu.cc",
        "caffe2/sgd/fp32_momentum_sgd_op.cu.cc",
        "caffe2/sgd/lars_op_gpu.cu.cc",
        "caffe2/sgd/momentum_sgd_op_gpu.cu.cc",
        "caffe2/sgd/rmsprop_op_gpu.cu.cc",
        "caffe2/sgd/yellowfin_op_gpu.cu.cc",
        "caffe2/utils/math/broadcast.cu.cc",
        "caffe2/utils/math/elementwise.cu.cc",
        "caffe2/utils/math/reduce.cu.cc",
        "caffe2/utils/math/transpose.cu.cc",
        "caffe2/utils/math_gpu.cu.cc",
    ],
)
|
|
|
|
|
|
|
|
|
|
# To achieve finer granularity and make debugging easier, caffe2 is split into three
# libraries: ATen, caffe2, and caffe2_for_aten_headers. The ATen lib groups source
# code under the aten/ directory, and caffe2 contains most files under the `caffe2/`
# directory. Since the ATen lib and the caffe2 lib would depend on each other,
# `caffe2_for_aten_headers` is split out from `caffe2` to avoid a dependency cycle.
|
|
|
|
|
# Minimal caffe2 header set that ATen is allowed to depend on; broken out of
# :caffe2_headers to break the ATen <-> caffe2 dependency cycle (see comment above).
cc_library(
    name = "caffe2_for_aten_headers",
    hdrs = [
        "caffe2/core/common.h",
        "caffe2/core/logging.h",
        "caffe2/core/types.h",
        "caffe2/perfkernels/common.h",
        "caffe2/perfkernels/embedding_lookup.h",
        "caffe2/perfkernels/embedding_lookup_idx.h",
        # Buildifier convention: keep this list sorted ("cpuid.h" before
        # "fixed_divisor.h"); the original listing had these two swapped.
        "caffe2/utils/cpuid.h",
        "caffe2/utils/fixed_divisor.h",
    ] + glob([
        "caffe2/utils/threadpool/*.h",
        "caffe2/proto/*.h",
    ]),
    copts = CAFFE2_COPTS,
    visibility = ["//visibility:public"],
    deps = [
        ":c10_headers",
        ":caffe2_protos",
        ":caffe2_core_macros_h",
    ],
)
|
|
|
|
|
|
|
|
|
|
# Public caffe2 header target: globs headers from every caffe2 subsystem
# (plus CUDA .cuh and image headers when building with --config=cuda) and
# exposes the include paths needed by downstream cc_library targets.
cc_library(
    name = "caffe2_headers",
    hdrs = glob([
        "caffe2/contrib/aten/*.h",
        "caffe2/contrib/gloo/*.h",
        "caffe2/core/*.h",
        "caffe2/core/nomnigraph/include/nomnigraph/Converters/*.h",
        "caffe2/core/nomnigraph/include/nomnigraph/Generated/*.h",
        "caffe2/core/nomnigraph/include/nomnigraph/Graph/*.h",
        "caffe2/core/nomnigraph/include/nomnigraph/Representations/*.h",
        "caffe2/core/nomnigraph/include/nomnigraph/Support/*.h",
        "caffe2/core/nomnigraph/include/nomnigraph/Transformations/*.h",
        "caffe2/core/nomnigraph/tests/*.h",
        "caffe2/db/*.h",
        "caffe2/distributed/*.h",
        "caffe2/ideep/*.h",
        "caffe2/ideep/operators/*.h",
        "caffe2/ideep/operators/quantization/*.h",
        "caffe2/ideep/utils/*.h",
        "caffe2/onnx/*.h",
        "caffe2/operators/*.h",
        "caffe2/operators/rnn/*.h",
        "caffe2/opt/*.h",
        "caffe2/perfkernels/*.h",
        "caffe2/predictor/*.h",
        "caffe2/predictor/emulator/*.h",
        "caffe2/proto/*.h",
        "caffe2/quantization/server/*.h",
        "caffe2/queue/*.h",
        "caffe2/serialize/*.h",
        "caffe2/sgd/*.h",
        "caffe2/share/contrib/depthwise/*.h",
        "caffe2/transforms/*.h",
        "caffe2/utils/*.h",
        "caffe2/utils/math/*.h",
        "caffe2/utils/threadpool/*.h",
        "modules/**/*.h",
    ]) + if_cuda(glob([
        # CUDA-only headers; note "caffe2/image/*.h" is gated on CUDA here.
        "caffe2/**/*.cuh",
        "caffe2/image/*.h",
    ])),
    copts = CAFFE2_COPTS,
    includes = [
        "caffe2/contrib/aten",
        "caffe2/core/nomnigraph/include",
        "third_party/miniz-2.0.8",
    ],
    visibility = ["//visibility:public"],
    deps = [
        ":caffe2_for_aten_headers",
        ":caffe2_protos",
    ],
)
|
|
|
|
|
|
|
|
|
|
# AVX2-specialized DNNLOWP kernels, compiled in their own target so the AVX2/
# FMA/F16C flags apply only to these files and not to the rest of caffe2.
# alwayslink keeps the operator registrations from being dropped by the linker.
cc_library(
    name = "caffe2_dnnlowp_avx2_ops",
    srcs = [
        "caffe2/quantization/server/elementwise_sum_dnnlowp_op_avx2.cc",
        "caffe2/quantization/server/fully_connected_fake_lowp_op_avx2.cc",
        "caffe2/quantization/server/group_norm_dnnlowp_op_avx2.cc",
        "caffe2/quantization/server/norm_minimization_avx2.cc",
        "caffe2/quantization/server/pool_dnnlowp_op_avx2.cc",
        "caffe2/quantization/server/relu_dnnlowp_op_avx2.cc",
        "caffe2/quantization/server/spatial_batch_norm_dnnlowp_op_avx2.cc",
        "caffe2/quantization/server/transpose.cc",
    ],
    copts = CAFFE2_COPTS + [
        "-mf16c",
        "-mavx2",
        "-mfma",
        "-mxsave",
    ],
    visibility = ["//visibility:public"],
    deps = [
        ":caffe2_headers",
        "@fbgemm",
    ],
    alwayslink = True,
)
|
|
|
|
|
|
|
|
|
|
# The main caffe2 CPU library. Aggregates the per-subsystem filegroups
# (":caffe2_*_srcs") plus a few individual sources, and selects CUDA vs
# CPU-only dependencies via if_cuda(). Note: deps order is preserved
# deliberately — with alwayslink/linkstatic it influences link order.
cc_library(
    name = "caffe2",
    srcs = [
        "caffe2/db/create_db_op.cc",
        "caffe2/db/protodb.cc",
        "caffe2/share/contrib/depthwise/depthwise3x3_conv_op.cc",
        ":caffe2_contrib_srcs",
        ":caffe2_core_srcs",
        ":caffe2_distributed_srcs",
        ":caffe2_ideep_srcs",
        ":caffe2_onnx_srcs",
        ":caffe2_operators_srcs",
        ":caffe2_opt_srcs",
        ":caffe2_perfkernels_srcs",
        ":caffe2_predictor_srcs",
        ":caffe2_quantization_srcs",
        ":caffe2_queue_srcs",
        ":caffe2_serialize_srcs",
        ":caffe2_sgd_srcs",
        ":caffe2_transforms_srcs",
        ":caffe2_utils_srcs",
    ],
    # -mf16c: half-precision conversion intrinsics are used unconditionally
    # in these sources.
    copts = CAFFE2_COPTS + ["-mf16c"],
    linkstatic = 1,
    visibility = ["//visibility:public"],
    deps = [
        ":caffe2_headers",
        ":caffe2_dnnlowp_avx2_ops",
        ":caffe2_perfkernels_avx",
        ":caffe2_perfkernels_avx2",
        ":caffe2_perfkernels_avx512",
        ":caffe2_protos",
        "//third_party/miniz-2.0.8:miniz",
        "@com_google_protobuf//:protobuf",
        "@eigen",
        "@fbgemm//:fbgemm_src_headers",
        "@foxi",
        "@gloo",
        "@onnx",
        "@fmt",
    ] + if_cuda(
        # CUDA build: pull in the CUDA kernels and the CUDA-aware
        # ATen/tensorpipe variants.
        [
            ":caffe2_cpp_cuda",
            ":aten_cuda",
            "@tensorpipe//:tensorpipe_cuda",
        ],
        # CPU-only build.
        [
            ":aten",
            "@tensorpipe//:tensorpipe_cpu",
        ],
    ),
    # alwayslink: keeps operator-registration static initializers.
    alwayslink = True,
)
|
|
|
|
|
|
|
|
|
|
# C++ (host-side) CUDA sources of caffe2. Thin wrapper that compiles the
# ":caffe2_cuda_srcs" filegroup and links against the nvcc-compiled
# :caffe2_cuda kernels below.
cc_library(
    name = "caffe2_cpp_cuda",
    srcs = [":caffe2_cuda_srcs"],
    copts = CAFFE2_COPTS,
    visibility = ["//visibility:public"],
    deps = [
        ":caffe2_cuda",
        ":caffe2_headers",
    ],
    alwayslink = True,
)
|
|
|
|
|
|
|
|
|
|
# Device-side CUDA kernels (.cu files), compiled with the custom
# cu_library rule from //tools/rules:cu.bzl.
# NOTE(review): torch_cuda_half_options is defined elsewhere in this file
# (outside this chunk); presumably it adds half-precision compile flags.
cu_library(
    name = "caffe2_cuda",
    srcs = [":caffe2_cu_srcs"],
    copts = CAFFE2_COPTS + torch_cuda_half_options,
    visibility = ["//visibility:public"],
    deps = [
        ":aten",
        ":caffe2_headers",
        "@cub",
        "@cuda//:cublas",
        "@cuda//:curand",
        "@cudnn",
        "@eigen",
        "@gloo",
        "@tensorpipe//:tensorpipe_cuda",
    ],
    alwayslink = True,
)
|
|
|
|
|
|
|
|
|
|
# Common compiler options for the caffe2_perfkernels_* libraries below.
# Each ISA-specific library adds its own -mavx*/-mfma flags on top.
PERF_COPTS = [
    # Feature-detection / capability defines.
    "-DHAVE_GCC_GET_CPUID",
    "-DUSE_AVX",
    "-DUSE_AVX2",
    "-DTH_HAVE_THREAD",
    "-DHAVE_AVX_CPU_DEFINITION",
    "-DHAVE_AVX2_CPU_DEFINITION",
    "-DENABLE_ALIAS=1",
    "-DHAVE_MALLOC_USABLE_SIZE=1",
    "-DHAVE_MMAP=1",
    "-DHAVE_SHM_OPEN=1",
    "-DHAVE_SHM_UNLINK=1",
    "-DSLEEF_STATIC_LIBS=1",
    "-D_FILE_OFFSET_BITS=64",
    "-DUSE_FBGEMM",
    # Codegen options shared by all perfkernel variants.
    "-fvisibility-inlines-hidden",
    "-Wunused-parameter",
    "-fno-math-errno",
    "-fno-trapping-math",
    "-mf16c",
]
|
|
|
|
|
|
|
|
|
|
# Headers shared by every caffe2_perfkernels_* variant below.
PERF_HEADERS = glob(["caffe2/perfkernels/*.h", "caffe2/core/*.h"])
|
|
|
|
|
|
|
|
|
|
# Perf kernels compiled for plain AVX. The *_avx.cc naming convention
# selects which sources belong to this ISA level.
cc_library(
    name = "caffe2_perfkernels_avx",
    srcs = glob([
        "caffe2/perfkernels/*_avx.cc",
    ]),
    hdrs = PERF_HEADERS,
    copts = PERF_COPTS + [
        "-mavx",
    ],
    visibility = ["//visibility:public"],
    deps = [
        ":caffe2_headers",
        ":c10",
    ],
    alwayslink = True,
)
|
|
|
|
|
|
|
|
|
|
# Perf kernels compiled for AVX2 (+FMA); sources matching *_avx2.cc.
cc_library(
    name = "caffe2_perfkernels_avx2",
    srcs = glob([
        "caffe2/perfkernels/*_avx2.cc",
    ]),
    hdrs = PERF_HEADERS,
    copts = PERF_COPTS + [
        "-mavx2",
        "-mfma",
        "-mavx",
    ],
    visibility = ["//visibility:public"],
    deps = [
        ":caffe2_headers",
        ":c10",
    ],
    alwayslink = True,
)
|
|
|
|
|
|
|
|
|
|
# Perf kernels compiled for AVX-512. Unlike the avx/avx2 variants this
# lists its single source explicitly rather than globbing.
cc_library(
    name = "caffe2_perfkernels_avx512",
    srcs = [
        "caffe2/perfkernels/common_avx512.cc",
    ],
    hdrs = PERF_HEADERS,
    copts = PERF_COPTS + [
        "-mavx512f",
        "-mavx512dq",
        "-mavx512vl",
        "-mavx2",
        "-mfma",
        "-mavx",
    ],
    visibility = ["//visibility:public"],
    deps = [
        ":caffe2_headers",
        ":c10",
    ],
    alwayslink = True,
)
|
|
|
|
|
|
|
|
|
|
# torch
|
2021-02-04 06:16:04 +00:00
|
|
|
# Generator tool used by the :version_h genrule below to produce
# torch/csrc/api/include/torch/version.h from its .in template.
py_binary(
    name = "gen_version_header",
    srcs = ["tools/setup_helpers/gen_version_header.py"],
)
|
|
|
|
|
|
|
|
|
|
# Stamps the version from version.txt into the version.h template at
# build time, using the :gen_version_header tool above.
genrule(
    name = "version_h",
    srcs = [
        "torch/csrc/api/include/torch/version.h.in",
        "version.txt",
    ],
    outs = ["torch/csrc/api/include/torch/version.h"],
    cmd = "$(location :gen_version_header) " +
          "--template-path $(location torch/csrc/api/include/torch/version.h.in) " +
          "--version-path $(location version.txt) --output-path $@",
    # Fixed: double quotes + trailing comma, matching the string style
    # used everywhere else in this file (buildifier convention).
    tools = [":gen_version_header"],
)
|
|
|
|
|
|
2020-05-19 05:00:55 +00:00
|
|
|
# CUDA-specific torch headers; included in :torch_headers only under
# if_cuda, and used below as a glob exclude for the CPU header set.
torch_cuda_headers = glob(["torch/csrc/cuda/*.h"])
|
2020-04-07 05:48:33 +00:00
|
|
|
# Public torch headers. CUDA headers are added only in CUDA builds; the
# autograd "generated/*" headers are excluded because they are produced
# by :cpp_generated_code and would otherwise be globbed twice.
cc_library(
    name = "torch_headers",
    hdrs = if_cuda(
        torch_cuda_headers,
    ) + glob(
        [
            "torch/*.h",
            "torch/csrc/**/*.h",
            "torch/csrc/distributed/c10d/*.hpp",
            "torch/lib/libshm/*.h",
        ],
        exclude = [
            "torch/csrc/autograd/generated/VariableType.h",
            "torch/csrc/autograd/generated/RegistrationDeclarations.h",
            "torch/csrc/autograd/generated/variable_factories.h",
            "torch/csrc/autograd/generated/Functions.h",
        ] + torch_cuda_headers,
    ) + [":cpp_generated_code", ":version_h"],
    includes = [
        "torch/csrc",
        "torch/csrc/api/include",
        "torch/csrc/distributed",
        "torch/lib",
        "torch/lib/libshm",
    ],
    visibility = ["//visibility:public"],
    deps = [
        ":aten_headers",
        ":c10_headers",
        ":caffe2_headers",
        "@local_config_python//:python_headers",
        "@onnx",
    ],
    alwayslink = True,
)
|
|
|
|
|
|
|
|
|
|
# Compiler options for the :torch library, layered on COMMON_COPTS.
TORCH_COPTS = COMMON_COPTS + [
    "-Dtorch_EXPORTS",
    "-DHAVE_AVX_CPU_DEFINITION",
    "-DHAVE_AVX2_CPU_DEFINITION",
    "-DCAFFE2_USE_GLOO",
    "-fvisibility-inlines-hidden",
    # Fixed: removed stray trailing space inside the flag string
    # (was "-fno-math-errno ").
    "-fno-math-errno",
    "-fno-trapping-math",
]
|
|
|
|
|
|
|
|
|
|
# The main libtorch target. CUDA frontend sources are added only in CUDA
# builds; python_nccl.cpp/nccl.cpp are excluded here because they belong
# to the Python bindings (:torch_python), not the core library.
cc_library(
    name = "torch",
    srcs = if_cuda(glob(
        [
            "torch/csrc/cuda/*.cpp",
            "torch/csrc/autograd/functions/comm.cpp",
        ],
        exclude = [
            "torch/csrc/cuda/python_nccl.cpp",
            "torch/csrc/cuda/nccl.cpp",
        ],
    )) + libtorch_core_sources + libtorch_distributed_sources + torch_cpp_srcs + libtorch_extra_sources + jit_core_sources + [
        ":cpp_generated_code",
    ],
    copts = TORCH_COPTS + if_cuda(["-DUSE_CUDA=1"]),
    defines = [
        "CAFFE2_NIGHTLY_VERSION=20200115",
    ],
    visibility = ["//visibility:public"],
    deps = [
        ":caffe2",
        ":torch_headers",
    ],
    alwayslink = True,
)
|
|
|
|
|
|
2020-05-20 05:49:28 +00:00
|
|
|
# libshm: the shared-memory manager used by torch's multiprocessing
# tensor sharing (torch/lib/libshm).
cc_library(
    name = "shm",
    srcs = glob(["torch/lib/libshm/*.cpp"]),
    deps = [":torch"],
)
|
|
|
|
|
|
2020-04-07 05:48:33 +00:00
|
|
|
# Catch-all header target: every .h/.cuh in the tree plus the generated
# code. Broader than :torch_headers; exported for external consumers.
cc_library(
    name = "libtorch_headers",
    hdrs = glob([
        "**/*.h",
        "**/*.cuh",
    ]) + [
        ":generated_code",
    ],
    includes = [
        ".",
        "torch/csrc/api/include",
        "torch/csrc/distributed",
        "torch/lib",
        "torch/lib/libshm",
    ],
    visibility = ["//visibility:public"],
    deps = [
        ":aten_headers",
        ":c10_headers",
        ":caffe2_headers",
    ],
)
|
|
|
|
|
|
2020-05-20 05:49:28 +00:00
|
|
|
# Python binding layer for torch. The generic/*.cpp files are listed as
# hdrs because they are #included (not compiled standalone) by the
# type-templated binding sources.
cc_library(
    name = "torch_python",
    srcs = libtorch_python_core_sources + [":python_generated_code"],
    hdrs = glob([
        "torch/csrc/generic/*.cpp",
    ]),
    deps = [
        ":torch",
        ":shm",
    ],
)
|
|
|
|
|
|
|
|
|
|
# The torch._C Python extension module; stub.c just re-exports the
# initialization entry point from :torch_python.
pybind_extension(
    name = "_C",
    srcs = ["torch/csrc/stub.c"],
    deps = [
        ":torch_python"
    ],
)
|
|
|
|
|
|
2020-04-07 05:48:33 +00:00
|
|
|
# cpp api tests
|
|
|
|
|
# Shared support code (fixtures + baseline data headers) for the C++ API
# tests generated below.
cc_library(
    name = "test_support",
    testonly = True,
    srcs = [
        "test/cpp/api/support.cpp",
    ],
    hdrs = [
        "test/cpp/api/init_baseline.h",
        "test/cpp/api/optim_baseline.h",
        "test/cpp/api/support.h",
        "test/cpp/common/support.h",
    ],
    deps = [
        ":torch",
        "@com_google_googletest//:gtest_main",
    ],
)
|
|
|
|
|
|
|
|
|
|
# Torch integration tests rely on a labeled data set from the MNIST database.
|
|
|
|
|
# http://yann.lecun.com/exdb/mnist/
|
|
|
|
|
|
2020-05-19 05:00:55 +00:00
|
|
|
# One cc_test per C++ API test file, named after the file:
# e.g. test/cpp/api/any.cpp -> any_test (dashes become underscores).
cpp_api_tests = glob(["test/cpp/api/*.cpp"])

[
    cc_test(
        name = paths.split_extension(paths.basename(filename))[0].replace("-","_") + "_test",
        size = "medium",
        srcs = [filename],
        deps = [
            ":test_support",
            "@com_google_googletest//:gtest_main",
        ],
    ) for filename in cpp_api_tests
]
|
2020-04-07 05:48:33 +00:00
|
|
|
|
|
|
|
|
# Suite of the per-file C++ API tests generated above.
# NOTE(review): this list is maintained by hand while the tests are
# globbed — a new test/cpp/api/*.cpp file must also be added here.
test_suite(
    name = "api_tests",
    tests = [
        "any_test",
        "autograd_test",
        "dataloader_test",
        "enum_test",
        "expanding_array_test",
        "functional_test",
        "init_test",
        "integration_test",
        "jit_test",
        "memory_test",
        "misc_test",
        "module_test",
        "modulelist_test",
        "modules_test",
        "nn_utils_test",
        "optim_test",
        "ordered_dict_test",
        "rnn_test",
        "sequential_test",
        "serialize_test",
        "static_test",
        "tensor_options_test",
        "tensor_test",
        "torch_include_test",
    ],
)
|
|
|
|
|
|
|
|
|
|
# dist autograd tests
|
|
|
|
|
# dist autograd tests
cc_test(
    name = "torch_dist_autograd_test",
    size = "small",
    srcs = ["test/cpp/dist_autograd/test_dist_autograd.cpp"],
    # exclusive: must not run concurrently with other tests;
    # gpu-required: scheduled only on machines with a GPU.
    tags = [
        "exclusive",
        "gpu-required",
    ],
    deps = [
        ":torch",
        "@com_google_googletest//:gtest_main",
    ],
)
|
|
|
|
|
|
|
|
|
|
# jit tests
|
|
|
|
|
# Because these individual unit tests require custom registering,
|
|
|
|
|
# it is easier to mimic the cmake build by globing together a single test.
|
|
|
|
|
# JIT + tensorexpr tests. The individual unit tests require custom
# registration, so (mirroring the cmake build) they are globbed into a
# single test binary rather than one cc_test per file.
cc_test(
    name = "jit_tests",
    size = "small",
    srcs = glob([
        "test/cpp/jit/*.cpp",
        "test/cpp/jit/*.h",
        "test/cpp/tensorexpr/*.cpp",
        "test/cpp/tensorexpr/*.h",
    ]),
    linkstatic = True,
    tags = [
        "exclusive",
        "gpu-required",
    ],
    deps = [
        ":torch",
        "@com_google_googletest//:gtest_main",
    ],
)
|
|
|
|
|
|
2021-10-07 17:03:02 +00:00
|
|
|
# Lazy-tensor tests, globbed into one binary like jit_tests above.
# CPU-only: no gpu-required tag.
cc_test(
    name = "lazy_tests",
    size = "small",
    srcs = glob([
        "test/cpp/lazy/*.cpp",
        "test/cpp/lazy/*.h",
    ]),
    linkstatic = True,
    tags = [
        "exclusive",
    ],
    deps = [
        ":torch",
        "@com_google_googletest//:gtest_main",
    ],
)
|
|
|
|
|
|
2020-04-07 05:48:33 +00:00
|
|
|
# all tests
|
|
|
|
|
# Umbrella suite over every test group in this file (c10_tests is
# defined elsewhere in the file). lazy_tests is intentionally(?) not
# listed — NOTE(review): confirm whether it should be added.
test_suite(
    name = "all_tests",
    tests = [
        "api_tests",
        "c10_tests",
        "jit_tests",
        "torch_dist_autograd_test",
    ],
)
|