load("@bazel_skylib//lib:paths.bzl", "paths")
load("@pybind11_bazel//:build_defs.bzl", "pybind_extension")
load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library", "cc_test")
load("@rules_python//python:defs.bzl", "py_library", "py_test")
load("@pytorch//third_party:substitution.bzl", "header_template_rule", "template_rule")
load("@pytorch//tools/rules:cu.bzl", "cu_library")
load("@pytorch//tools/config:defs.bzl", "if_cuda")
load("@pytorch//:aten.bzl", "generate_aten", "intern_build_aten_ops")
load(":build.bzl", "GENERATED_AUTOGRAD_CPP", "GENERATED_AUTOGRAD_PYTHON", "define_targets")
load(":build_variables.bzl", "jit_core_sources", "lazy_tensor_ts_sources", "libtorch_core_sources", "libtorch_cuda_sources", "libtorch_distributed_sources", "libtorch_extra_sources", "libtorch_python_core_sources", "libtorch_python_cuda_sources", "libtorch_python_distributed_sources", "torch_cpp_srcs")
load(":ufunc_defs.bzl", "aten_ufunc_generated_cpu_kernel_sources", "aten_ufunc_generated_cpu_sources", "aten_ufunc_generated_cuda_sources")
load("//:tools/bazel.bzl", "rules")
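
# build.bzl defines the shared torch/autograd codegen targets through the
# injected `rules` struct (from tools/bazel.bzl), which lets the same target
# definitions be instantiated under build systems other than Bazel.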
define_targets(rules = rules)
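
# Compiler definitions shared by the C++ targets in this file; if_cuda()
# appends the CUDA-only flags when the build is configured for CUDA.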
COMMON_COPTS = [
    "-DHAVE_MALLOC_USABLE_SIZE=1",
    "-DHAVE_MMAP=1",
    "-DHAVE_SHM_OPEN=1",
    "-DHAVE_SHM_UNLINK=1",
    "-D_FILE_OFFSET_BITS=64",
    "-DUSE_FBGEMM",
    "-DUSE_DISTRIBUTED",
    "-DAT_PER_OPERATOR_HEADERS",
    "-DATEN_THREADING=NATIVE",
    "-DNO_CUDNN_DESTROY_HANDLE",
] + if_cuda([
    "-DUSE_CUDA",
    "-DUSE_CUDNN",
    # TODO: This should be passed only when building for CUDA 11.5 or newer to
    # use cub in a safe manner; see:
    # https://github.com/pytorch/pytorch/pull/55292
    "-DCUB_WRAPPED_NAMESPACE=at_cuda_detail",
])
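
# For reference, if_cuda() follows the usual Bazel select() idiom. A minimal
# sketch of what tools/config:defs.bzl provides (the exact config_setting
# label is an assumption here):
#
#   def if_cuda(if_true, if_false = []):
#       return select({
#           "@pytorch//tools/config:cuda": if_true,
#           "//conditions:default": if_false,
#       })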

aten_generation_srcs = [
    "aten/src/ATen/native/native_functions.yaml",
    "aten/src/ATen/native/tags.yaml",
] + glob(["aten/src/ATen/templates/**"])
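
# Torchgen outputs for the non-CUDA dispatch-key registrations, plus the
# generated ATen headers and core sources.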
generated_cpu_cpp = [
    "aten/src/ATen/RegisterBackendSelect.cpp",
    "aten/src/ATen/RegisterCPU.cpp",
    "aten/src/ATen/RegisterFunctionalization_0.cpp",
    "aten/src/ATen/RegisterFunctionalization_1.cpp",
    "aten/src/ATen/RegisterFunctionalization_2.cpp",
    "aten/src/ATen/RegisterFunctionalization_3.cpp",
    # "aten/src/ATen/RegisterFunctionalizationEverything.cpp",
    "aten/src/ATen/RegisterMkldnnCPU.cpp",
    "aten/src/ATen/RegisterNestedTensorCPU.cpp",
    "aten/src/ATen/RegisterQuantizedCPU.cpp",
    "aten/src/ATen/RegisterSparseCPU.cpp",
    "aten/src/ATen/RegisterSparseCsrCPU.cpp",
    "aten/src/ATen/RegisterZeroTensor.cpp",
    "aten/src/ATen/RegisterCompositeImplicitAutograd.cpp",
    "aten/src/ATen/RegisterCompositeImplicitAutogradNestedTensor.cpp",
    "aten/src/ATen/RegisterCompositeExplicitAutograd.cpp",
    "aten/src/ATen/RegisterCompositeExplicitAutogradNonFunctional.cpp",
    "aten/src/ATen/RegisterMeta.cpp",
    "aten/src/ATen/RegisterSparseMeta.cpp",
    "aten/src/ATen/RegisterQuantizedMeta.cpp",
    "aten/src/ATen/RegisterNestedTensorMeta.cpp",
    "aten/src/ATen/RegisterSchema.cpp",
    "aten/src/ATen/CPUFunctions.h",
    "aten/src/ATen/CPUFunctions_inl.h",
    "aten/src/ATen/CompositeExplicitAutogradFunctions.h",
    "aten/src/ATen/CompositeExplicitAutogradFunctions_inl.h",
    "aten/src/ATen/CompositeExplicitAutogradNonFunctionalFunctions.h",
    "aten/src/ATen/CompositeExplicitAutogradNonFunctionalFunctions_inl.h",
    "aten/src/ATen/CompositeImplicitAutogradFunctions.h",
    "aten/src/ATen/CompositeImplicitAutogradFunctions_inl.h",
    "aten/src/ATen/CompositeImplicitAutogradNestedTensorFunctions.h",
    "aten/src/ATen/CompositeImplicitAutogradNestedTensorFunctions_inl.h",
    "aten/src/ATen/CompositeViewCopyKernels.cpp",
    "aten/src/ATen/FunctionalInverses.h",
    "aten/src/ATen/Functions.h",
    "aten/src/ATen/Functions.cpp",
    "aten/src/ATen/RedispatchFunctions.h",
    "aten/src/ATen/Operators.h",
    "aten/src/ATen/Operators_0.cpp",
    "aten/src/ATen/Operators_1.cpp",
    "aten/src/ATen/Operators_2.cpp",
    "aten/src/ATen/Operators_3.cpp",
    "aten/src/ATen/Operators_4.cpp",
    "aten/src/ATen/NativeFunctions.h",
    "aten/src/ATen/MetaFunctions.h",
    "aten/src/ATen/MetaFunctions_inl.h",
    "aten/src/ATen/MethodOperators.h",
    "aten/src/ATen/NativeMetaFunctions.h",
    "aten/src/ATen/RegistrationDeclarations.h",
    "aten/src/ATen/VmapGeneratedPlumbing.h",
    "aten/src/ATen/core/aten_interned_strings.h",
    "aten/src/ATen/core/enum_tag.h",
    "aten/src/ATen/core/TensorBody.h",
    "aten/src/ATen/core/TensorMethods.cpp",
    "aten/src/ATen/core/ATenOpList.cpp",
]
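
# Torchgen outputs for the CUDA backend registrations.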
generated_cuda_cpp = [
    "aten/src/ATen/CUDAFunctions.h",
    "aten/src/ATen/CUDAFunctions_inl.h",
    "aten/src/ATen/RegisterCUDA.cpp",
    "aten/src/ATen/RegisterNestedTensorCUDA.cpp",
    "aten/src/ATen/RegisterQuantizedCUDA.cpp",
    "aten/src/ATen/RegisterSparseCUDA.cpp",
    "aten/src/ATen/RegisterSparseCsrCUDA.cpp",
]
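
# Run the torchgen generator (//torchgen:gen) over native_functions.yaml,
# tags.yaml, and the ATen templates to produce all of the sources and headers
# listed above, plus the per-operator ufunc kernels.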
generate_aten(
    name = "generated_aten_cpp",
    srcs = aten_generation_srcs,
    outs = (
        generated_cpu_cpp +
        generated_cuda_cpp +
        aten_ufunc_generated_cpu_sources("aten/src/ATen/{}") +
        aten_ufunc_generated_cpu_kernel_sources("aten/src/ATen/{}") +
        aten_ufunc_generated_cuda_sources("aten/src/ATen/{}") + [
            "aten/src/ATen/Declarations.yaml",
        ]
    ),
    generator = "//torchgen:gen",
)
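
# The aten_ufunc_generated_* helpers from ufunc_defs.bzl take a format string
# whose "{}" is interpolated with each generated file name; e.g.
# aten_ufunc_generated_cpu_sources("aten/src/ATen/{}") yields paths like
# "aten/src/ATen/UfuncCPU_add.cpp" for each ufunc operator.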
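
# Bundle the autograd C++ sources emitted by the :generate-code rule so other
# targets can depend on them as a single filegroup.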
filegroup(
    name = "cpp_generated_code",
    srcs = GENERATED_AUTOGRAD_CPP,
    data = [":generate-code"],
)

# ATen
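#
# The filegroups below slice the ATen C++ sources by feature area so that
# library targets can assemble exactly the subsets a configuration needs.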

filegroup(
    name = "aten_base_cpp",
    srcs = glob([
        "aten/src/ATen/*.cpp",
        "aten/src/ATen/functorch/*.cpp",
        "aten/src/ATen/detail/*.cpp",
        "aten/src/ATen/cpu/*.cpp",
    ]),
)

filegroup(
    name = "ATen_CORE_SRCS",
    srcs = glob(
        [
            "aten/src/ATen/core/**/*.cpp",
        ],
        exclude = [
            "aten/src/ATen/core/**/*_test.cpp",
        ],
    ),
)

filegroup(
    name = "aten_native_cpp",
    srcs = glob(["aten/src/ATen/native/*.cpp"]),
)

filegroup(
    name = "aten_native_sparse_cpp",
    srcs = glob(["aten/src/ATen/native/sparse/*.cpp"]),
)

filegroup(
    name = "aten_native_nested_cpp",
    srcs = glob(["aten/src/ATen/native/nested/*.cpp"]),
)

filegroup(
    name = "aten_native_quantized_cpp",
    srcs = glob(
        [
            "aten/src/ATen/native/quantized/*.cpp",
            "aten/src/ATen/native/quantized/cpu/*.cpp",
        ],
    ),
)

filegroup(
    name = "aten_native_transformers_cpp",
    srcs = glob(["aten/src/ATen/native/transformers/*.cpp"]),
)

filegroup(
    name = "aten_native_mkl_cpp",
    srcs = glob([
        "aten/src/ATen/native/mkl/*.cpp",
        "aten/src/ATen/mkl/*.cpp",
    ]),
)

filegroup(
    name = "aten_native_mkldnn_cpp",
    srcs = glob(["aten/src/ATen/native/mkldnn/*.cpp"]),
)

filegroup(
    name = "aten_native_xnnpack",
    srcs = glob(["aten/src/ATen/native/xnnpack/*.cpp"]),
)

filegroup(
    name = "aten_base_vulkan",
    srcs = glob(["aten/src/ATen/vulkan/*.cpp"]),
)

filegroup(
    name = "aten_base_metal",
    srcs = glob(["aten/src/ATen/metal/*.cpp"]),
)

filegroup(
    name = "ATen_QUANTIZED_SRCS",
    srcs = glob(
        [
            "aten/src/ATen/quantized/**/*.cpp",
        ],
        exclude = [
            "aten/src/ATen/quantized/**/*_test.cpp",
        ],
    ),
)
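
# CUDA support is split between host-side C++ sources (below) and the .cu
# device sources collected in :aten_cu_srcs, which are meant to be compiled
# with cu_library.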
filegroup(
    name = "aten_cuda_cpp_srcs",
    srcs = glob(
        [
            "aten/src/ATen/cuda/*.cpp",
            "aten/src/ATen/cuda/detail/*.cpp",
            "aten/src/ATen/cuda/tunable/*.cpp",
            "aten/src/ATen/cudnn/*.cpp",
            "aten/src/ATen/native/cuda/*.cpp",
            "aten/src/ATen/native/cuda/linalg/*.cpp",
            "aten/src/ATen/native/cudnn/*.cpp",
            "aten/src/ATen/native/miopen/*.cpp",
            "aten/src/ATen/native/nested/cuda/*.cpp",
            "aten/src/ATen/native/quantized/cuda/*.cpp",
            "aten/src/ATen/native/quantized/cudnn/*.cpp",
            "aten/src/ATen/native/sparse/cuda/*.cpp",
            "aten/src/ATen/native/transformers/cuda/*.cpp",
        ],
    ),
)
|
|
|
|
|
|
|
|
|
|
filegroup(
|
2021-12-17 21:41:24 +00:00
|
|
|
name = "aten_cu_srcs",
|
|
|
|
|
srcs = glob([
|
|
|
|
|
"aten/src/ATen/cuda/*.cu",
|
|
|
|
|
"aten/src/ATen/cuda/detail/*.cu",
|
|
|
|
|
"aten/src/ATen/native/cuda/*.cu",
|
2022-04-13 16:46:33 +00:00
|
|
|
"aten/src/ATen/native/nested/cuda/*.cu",
|
2021-12-17 21:41:24 +00:00
|
|
|
"aten/src/ATen/native/quantized/cuda/*.cu",
|
|
|
|
|
"aten/src/ATen/native/sparse/cuda/*.cu",
|
2022-04-26 00:02:39 +00:00
|
|
|
"aten/src/ATen/native/transformers/cuda/*.cu",
|
ufunc codegen (#65851)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/65851
Design doc: https://docs.google.com/document/d/12rtlHnPUpaJ-I52Iob3L0WA3rKRr_OY7fXqeCvn2MVY/edit
First read the design doc to understand the user syntax. In this PR, we have converted add to use ufunc codegen; most of the cpp changes are deleting the preexisting implementations of add, and ufunc/add.h are the new implementations in the ufunc format.
The bulk of this PR is in the new codegen machinery. Here's the order to read the files:
* `tools/codegen/model.py`
* Some self-explanatory utility classes: `ScalarType`, `DTYPE_CLASSES`
* New classes for representing ufunc entries in `native_functions.yaml`: `UfuncKey` and `UfuncInnerLoop`, as well as parsing logic for these entries. UfuncKey has some unusual entries (e.g., CPUScalar) that don't show up in the documentation, more on these below).
* A predicate `is_ufunc_dispatch_key` for testing which dispatch keys should get automatically generated when an operator opts into ufuncs (CPU and CUDA, for now!)
* `tools/codegen/api/types.py`
* More self-explanatory utility stuff: ScalarTypeToCppMapping mapping ScalarType to CppTypes; Binding.rename for changing the name of a binding (used when we assign constructor variables to member variables inside CUDA functors)
* New VectorizedCType, representing `at::vec::Vectorized<T>`. This is used inside vectorized CPU codegen.
* New `scalar_t` and `opmath_t` BaseCppTypes, representing template parameters that we work with when doing codegen inside ufunc kernel loops (e.g., where you previously had Tensor, now you have `scalar_t`)
* `StructuredImplSignature` represents a `TORCH_IMPL_FUNC` definition, and straightforwardly follows from preexisting `tools.codegen.api.structured`
* `tools/codegen/translate.py` - Yes, we use translate a LOT in this PR. I improved some of the documentation, the only substantive changes are adding two new conversions: given a `scalar_t` or a `const Scalar&`, make it convertible to an `opmath_t`
* `tools/codegen/api/ufunc.py`
* OK, now we're at the meaty stuff. This file represents the calling conventions of three important concepts in ufunc codegen, which we'll describe shortly. All of these APIs are relatively simple, since there aren't any complicated types by the time you get to kernels.
* stubs are the DispatchStub trampolines that CPU kernels use to get to their vectorized versions. They drop all Tensor arguments (as they are in TensorIterator) but otherwise match the structured calling convention
* ufuncs are the inner loop template functions that you wrote in ufunc/add.h which do the actual computation in question. Here, all the Tensors and Scalars have been converted into the computation type (`opmath_t` in CUDA, `scalar_t` in CPU)
* ufunctors are a CUDA-only concept representing functors that take some of their arguments on a host-side constructor, and the rest in the device-side apply. Once again, Tensors and Scalars are converted into the computation type, `opmath_t`, but for clarity all the functions take `scalar_t` as argument (as this is the type that is most salient at the call site). Because the constructor and apply are code generated separately, `ufunctor_arguments` returns a teeny struct `UfunctorBindings`
* `tools/codegen/dest/ufunc.py` - the workhorse. This gets its own section below.
* `tools/codegen/gen.py` - just calling out to the new dest.ufunc implementation to generate UfuncCPU_add.cpp, UFuncCPUKernel_add.cpp and UfuncCUDA_add.cu files per ufunc operator. Each of these files does what you expect (small file that registers kernel and calls stub; CPU implementation; CUDA implementation). There is a new file manager for UFuncCPUKernel files as these need to get replicated by cmake for vectorization. One little trick to avoid recompilation is we directly replicate code generated forward declarations in these files, to reduce the number of headers we depend on (this is codegen, we're just doing the preprocessors job!)
* I'll talk about build system adjustments below.
OK, let's talk about tools/codegen/dest/ufunc.py. This file can be roughly understood in two halves: one for CPU code generation, and the other for CUDA code generation.
**CPU codegen.** Here's roughly what we want to generate:
```
// in UfuncCPU_add.cpp
using add_fn = void (*)(TensorIteratorBase&, const at::Scalar&);
DECLARE_DISPATCH(add_fn, add_stub);
DEFINE_DISPATCH(add_stub);
TORCH_IMPL_FUNC(ufunc_add_CPU)
(const at::Tensor& self, const at::Tensor& other, const at::Scalar& alpha, const at::Tensor& out) {
add_stub(device_type(), *this, alpha);
}
// in UfuncCPUKernel_add.cpp
void add_kernel(TensorIteratorBase& iter, const at::Scalar& alpha) {
at::ScalarType st = iter.common_dtype();
RECORD_KERNEL_FUNCTION_DTYPE("add_stub", st);
switch (st) {
AT_PRIVATE_CASE_TYPE("add_stub", at::ScalarType::Bool, bool, [&]() {
auto _s_alpha = alpha.to<scalar_t>();
cpu_kernel(iter, [=](scalar_t self, scalar_t other) {
return ufunc::add(self, other, _s_alpha);
});
})
AT_PRIVATE_CASE_TYPE(
"add_stub", at::ScalarType::ComplexFloat, c10::complex<float>, [&]() {
auto _s_alpha = alpha.to<scalar_t>();
auto _v_alpha = at::vec::Vectorized<scalar_t>(_s_alpha);
cpu_kernel_vec(
iter,
[=](scalar_t self, scalar_t other) {
return ufunc::add(self, other, _s_alpha);
},
[=](at::vec::Vectorized<scalar_t> self,
at::vec::Vectorized<scalar_t> other) {
return ufunc::add(self, other, _v_alpha);
});
})
...
```
The most interesting change about the generated code is what previously was an `AT_DISPATCH` macro invocation is now an unrolled loop. This makes it easier to vary behavior per-dtype (you can see in this example that the entry for bool and float differ) without having to add extra condtionals on top.
Otherwise, to generate this code, we have to hop through several successive API changes:
* In TORCH_IMPL_FUNC(ufunc_add_CPU), go from StructuredImplSignature to StubSignature (call the stub). This is normal argument massaging in the classic translate style.
* In add_kernel, go from StubSignature to UfuncSignature. This is nontrivial, because we must do various conversions outside of the inner kernel loop. These conversions are done by hand, setting up the context appropriately, and then the final ufunc call is done using translate. (BTW, I introduce a new convention here, call on a Signature, for code generating a C++ call, and I think we should try to use this convention elsewhere)
The other piece of nontrivial logic is the reindexing by dtype. This reindexing exists because the native_functions.yaml format is indexed by UfuncKey:
```
Generic: add (AllAndComplex, BFloat16, Half)
ScalarOnly: add (Bool)
```
but when we do code generation, we case on dtype first, and then we generate a `cpu_kernel` or `cpu_kernel_vec` call. We also don't care about CUDA code generation (which Generic) hits. Do this, we lower these keys into two low level keys, CPUScalar and CPUVector, which represent the CPU scalar and CPU vectorized ufuncs, respectively (Generic maps to CPUScalar and CPUVector, while ScalarOnly maps to CPUScalar only). Reindexing then gives us:
```
AllAndComplex:
CPUScalar: add
CPUVector: add
Bool:
CPUScalar: add
...
```
which is a good format for code generation, but too wordy to force native_functions.yaml authors to write. Note that when reindexing, it is possible for there to be a conflicting definition for the same dtype; we just define a precedence order and have one override the other, so that it is easy to specialize on a particular dtype if necessary. Also note that because CPUScalar/CPUVector are part of UfuncKey, technically you can manually specify them in native_functions.yaml, although I don't expect this functionality to be used.
**CUDA codegen.** CUDA code generation has many of the same ideas as CPU codegen, but it needs to know about functors, and stubs are handled slightly differently. Here is what we want to generate:
```
template <typename scalar_t>
struct CUDAFunctorOnSelf_add {
using opmath_t = at::opmath_type<scalar_t>;
opmath_t other_;
opmath_t alpha_;
CUDAFunctorOnSelf_add(opmath_t other, opmath_t alpha)
: other_(other), alpha_(alpha) {}
__device__ scalar_t operator()(scalar_t self) {
return ufunc::add(static_cast<opmath_t>(self), other_, alpha_);
}
};
... two more functors ...
void add_kernel(TensorIteratorBase& iter, const at::Scalar & alpha) {
TensorIteratorBase& iter = *this;
at::ScalarType st = iter.common_dtype();
RECORD_KERNEL_FUNCTION_DTYPE("ufunc_add_CUDA", st);
switch (st) {
AT_PRIVATE_CASE_TYPE("ufunc_add_CUDA", at::ScalarType::Bool, bool, [&]() {
using opmath_t = at::opmath_type<scalar_t>;
if (false) {
} else if (iter.is_cpu_scalar(1)) {
CUDAFunctorOnOther_add<scalar_t> ufunctor(
iter.scalar_value<opmath_t>(1), (alpha).to<opmath_t>());
iter.remove_operand(1);
gpu_kernel(iter, ufunctor);
} else if (iter.is_cpu_scalar(2)) {
CUDAFunctorOnSelf_add<scalar_t> ufunctor(
iter.scalar_value<opmath_t>(2), (alpha).to<opmath_t>());
iter.remove_operand(2);
gpu_kernel(iter, ufunctor);
} else {
gpu_kernel(iter, CUDAFunctor_add<scalar_t>((alpha).to<opmath_t>()));
}
})
...
REGISTER_DISPATCH(add_stub, &add_kernel);
TORCH_IMPL_FUNC(ufunc_add_CUDA)
(const at::Tensor& self,
const at::Tensor& other,
const at::Scalar& alpha,
const at::Tensor& out) {
add_kernel(*this, alpha);
}
```
The functor business is the bulk of the complexity. Like CPU, we decompose CUDA implementation into three low-level keys: CUDAFunctor (normal, all CUDA kernels will have this), and CUDAFunctorOnOther/CUDAFunctorOnScalar (these are to support Tensor-Scalar specializations when the Scalar lives on CPU). Both Generic and ScalarOnly provide ufuncs for CUDAFunctor, but for us to also lift these into Tensor-Scalar specializations, the operator itself must be eligible for Tensor-Scalar specialization. At the moment, this is hardcoded to be all binary operators, but in the future we can use tags in native_functions.yaml to disambiguate (or perhaps expand codegen to handle n-ary operators).
The reindexing process not only reassociates ufuncs by dtype, but it also works out if Tensor-Scalar specializations are needed and codegens the ufunctors necessary for the level of specialization here (`compute_ufunc_cuda_functors`). Generating the actual kernel (`compute_ufunc_cuda_dtype_body`) just consists of, for each specialization, constructing the functor and then passing it off to `gpu_kernel`. Most of the hard work is in functor generation, where we take care to make sure `operator()` has the correct input and output types (which `gpu_kernel` uses to arrange for memory accesses to the actual CUDA tensor; if you get these types wrong, your kernel will still work, it will just run very slowly!)
There is one big subtlety with CUDA codegen: this won't work:
```
Generic: add (AllAndComplex, BFloat16, Half)
ScalarOnly: add_bool (Bool)
```
This is because, even though there are separate Generic/ScalarOnly entries, we only generate a single functor to cover ALL dtypes in this case, and the functor has the ufunc name hardcoded into it. You'll get an error if you try to do this; to fix it, just make sure the ufunc is named consistently throughout (i.e., use `add` in both entries). In the code, you see this because after testing for the short-circuit case (when a user provided the functor themselves), we squash all the generic entries together and assert their ufunc names are the same. Hypothetically, if we generated a separate functor per dtype, we could support differently named ufuncs, but... why would you do that to yourself? (One piece of nastiness is that the native_functions.yaml syntax doesn't stop you from shooting yourself in the foot.)
A brief word about CUDA stubs: technically, they are not necessary, as there is no CPU/CPUKernel style split for CUDA kernels (so, if you look, the structured impl actually calls add_kernel directly). However, there is some code that still makes use of CUDA stubs (in particular, I use the stub to conveniently reimplement sub in terms of add), so we still register it. This might be worth tidying up some more at a later point in time.
**Build system changes.** If you are at FB, you should review these changes in fbcode, as there are several changes in files that are not exported to ShipIt.
The build system changes in this patch are substantively complicated by the fact that I have to implement these changes five times:
* OSS cmake build
* OSS Bazel build
* FB fbcode Buck build
* FB xplat Buck build (selective build)
* FB ovrsource Buck build
Due to technical limitations in the xplat Buck build related to selective build, you are required to list every ufunc header manually (this is done in tools/build_variables.bzl).
The OSS cmake changes are entirely in cmake/Codegen.cmake: there is a new set of files, cpu_vec_generated (corresponding to UfuncCPUKernel files), which is wired up in the same way as other files. These files are different because they need to get compiled multiple times under different vectorization settings. I adjust the codegen, slightly refactoring the inner loop into its own function so I can use a different base path calculation depending on whether the file is traditional (in the native/cpu folder) or generated (new stuff from this diff).
The Bazel/Buck changes are organized around tools/build_variables.bzl, which contains the canonical list of ufunc headers (aten_ufunc_headers), and tools/ufunc_defs.bzl (added to ShipIt export list in D34465699), which defines a number of functions that compute the generated cpu, cpu kernel and cuda files based on the headers list. For convenience, these functions take a genpattern (a string with a {} for interpolation) which can be used to easily reformat the file list into target form, which is commonly needed in the build systems.
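As a rough illustration of the genpattern idea, the helpers could be sketched in Starlark (Python syntax) as below; the name list and bodies are simplified assumptions, not the real definitions in ufunc_defs.bzl:
```python
# Simplified sketch: the real ufunc_defs.bzl derives these names from
# aten_ufunc_headers in tools/build_variables.bzl.
aten_ufunc_names = ["add"]  # assumption: one ufunc, from ufunc/add.h

def aten_ufunc_generated_cpu_sources(gen_pattern):
    return [gen_pattern.format("UfuncCPU_{}.cpp".format(n)) for n in aten_ufunc_names]

def aten_ufunc_generated_cpu_kernel_sources(gen_pattern):
    return [gen_pattern.format("UfuncCPUKernel_{}.cpp".format(n)) for n in aten_ufunc_names]

def aten_ufunc_generated_cuda_sources(gen_pattern):
    return [gen_pattern.format("UfuncCUDA_{}.cu".format(n)) for n in aten_ufunc_names]
```
With this shape, `aten_ufunc_generated_cpu_sources("aten/src/ATen/{}")`, as used in BUILD.bazel below, evaluates to `["aten/src/ATen/UfuncCPU_add.cpp"]`.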
The split between build_variables.bzl and ufunc_defs.bzl is required because build_variables.bzl is executed by a conventional Python interpreter as part of the OSS cmake build, but we require Skylark features to implement the functions in ufunc_defs.bzl. (I did some quick Googling but didn't find a lightweight way to run the Skylark interpreter in open source.)
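For what it's worth, here is a minimal sketch of how a conventional Python process can consume build_variables.bzl (the idea only, not the exact cmake glue):
```python
# build_variables.bzl sticks to the Python-compatible subset of Skylark, so a
# plain interpreter can exec it and read the variables out of the namespace.
scope = {}
with open("tools/build_variables.bzl") as f:
    exec(f.read(), scope)
print(scope["aten_ufunc_headers"])
```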
With these new file lists, the rest of the build changes are mostly inserting references to these files wherever necessary; in particular, cpu kernel files have to be worked into the multiple vectorization build flow (intern_build_aten_ops in OSS Bazel). Most of the subtlety relates to selective build. Selective build requires operator files to be copied per overall selective build; as dhruvbird explained to me, glob expansion happens during the action graph phase, but the selective build handling of TEMPLATE_SOURCE_LIST references the target graph. In other words, we can't use a glob to generate deps for another rule; we need to copy files from wherever they live (including generated files) to a staging folder so the rules can pick them up.
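As a sketch of the workaround (rule names and layout here are hypothetical, not the real macros in pt_template_srcs.bzl): each explicitly listed file, including generated ones, gets its own copy rule into a staging folder, so downstream rules see concrete targets instead of a glob:
```python
# Hypothetical Starlark sketch of the per-file staging copy.
def stage_ufunc_headers(name, srcs):
    outs = []
    for src in srcs:
        out = "staging/" + src
        native.genrule(
            name = name + "_" + src.replace("/", "_"),
            srcs = [src],
            outs = [out],
            cmd = "cp $< $@",  # one src, one out
        )
        outs.append(out)
    native.filegroup(name = name, srcs = outs)
```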
It can be somewhat confusing to understand which bzl files are associated with which build. Here are the relevant mappings for files I edited:
* Used by everyone - tools/build_variables.bzl, tools/ufunc_defs.bzl
* OSS Bazel - aten.bzl, BUILD.bazel
* FB fbcode Buck - TARGETS
* FB xplat Buck - BUCK, pt_defs.bzl, pt_template_srcs.bzl
* FB ovrsource Buck - ovrsource_defs.bzl, pt_defs.bzl
Note that pt_defs.bzl is used by both xplat and ovrsource. This leads to the "tiresome" handling for enabled backends, as selective build is CPU only, but ovrsource is CPU and CUDA.
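A hypothetical sketch of what the shared handling in pt_defs.bzl has to look like (names are illustrative, not the real macros): callers pass the set of enabled backends, and CUDA files are only appended when CUDA is enabled:
```python
# Illustrative only: xplat (selective build) would pass ["CPU"], while
# ovrsource would pass ["CPU", "CUDA"].
def get_aten_ufunc_generated_files(enabled_backends):
    files = (aten_ufunc_generated_cpu_sources("aten/src/ATen/{}") +
             aten_ufunc_generated_cpu_kernel_sources("aten/src/ATen/{}"))
    if "CUDA" in enabled_backends:
        files += aten_ufunc_generated_cuda_sources("aten/src/ATen/{}")
    return files
```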
BTW, while I was at it, I beefed up fb/build_arvr.sh to also do a CUDA ovrsource build, which was not triggered previously.
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Test Plan: Imported from OSS
Reviewed By: albanD
Differential Revision: D31306586
Pulled By: ezyang
fbshipit-source-id: 210258ce83f578f79cf91b77bfaeac34945a00c6
(cherry picked from commit d65157b0b894b6701ee062f05a5f57790a06c91c)
2022-02-28 23:46:04 +00:00
|
|
|
]) + aten_ufunc_generated_cuda_sources("aten/src/ATen/{}"),
|
|
|
|
|
# It's a bit puzzling to me why it's not necessary to declare the
|
|
|
|
|
# target that generates these sources...
|
2020-04-07 05:48:33 +00:00
|
|
|
)
|
|
|
|
|
|
2020-08-27 19:09:34 +00:00
|
|
|
header_template_rule(
|
2020-04-07 05:48:33 +00:00
|
|
|
name = "aten_src_ATen_config",
|
|
|
|
|
src = "aten/src/ATen/Config.h.in",
|
|
|
|
|
out = "aten/src/ATen/Config.h",
|
2021-12-17 21:41:24 +00:00
|
|
|
include = "aten/src",
|
2020-04-07 05:48:33 +00:00
|
|
|
substitutions = {
|
|
|
|
|
"@AT_MKLDNN_ENABLED@": "1",
|
2023-04-01 04:25:57 +00:00
|
|
|
"@AT_MKLDNN_ACL_ENABLED@": "0",
|
2021-12-17 21:41:24 +00:00
|
|
|
"@AT_MKL_ENABLED@": "1",
|
2021-12-22 21:50:47 +00:00
|
|
|
"@AT_MKL_SEQUENTIAL@": "0",
|
2021-06-30 23:27:07 +00:00
|
|
|
"@AT_POCKETFFT_ENABLED@": "0",
|
2020-04-07 05:48:33 +00:00
|
|
|
"@AT_NNPACK_ENABLED@": "0",
|
|
|
|
|
"@CAFFE2_STATIC_LINK_CUDA_INT@": "0",
|
2021-06-22 17:53:55 +00:00
|
|
|
"@AT_BUILD_WITH_BLAS@": "1",
|
|
|
|
|
"@AT_BUILD_WITH_LAPACK@": "1",
|
2020-06-19 03:17:48 +00:00
|
|
|
"@AT_PARALLEL_OPENMP@": "0",
|
|
|
|
|
"@AT_PARALLEL_NATIVE@": "1",
|
2021-06-22 17:53:55 +00:00
|
|
|
"@AT_BLAS_F2C@": "0",
|
|
|
|
|
"@AT_BLAS_USE_CBLAS_DOT@": "1",
|
2020-04-07 05:48:33 +00:00
|
|
|
},
|
|
|
|
|
)
|
|
|
|
|
|
2020-08-27 19:09:34 +00:00
|
|
|
header_template_rule(
|
2020-04-07 05:48:33 +00:00
|
|
|
name = "aten_src_ATen_cuda_config",
|
|
|
|
|
src = "aten/src/ATen/cuda/CUDAConfig.h.in",
|
|
|
|
|
out = "aten/src/ATen/cuda/CUDAConfig.h",
|
2021-12-17 21:41:24 +00:00
|
|
|
include = "aten/src",
|
2020-04-07 05:48:33 +00:00
|
|
|
substitutions = {
|
|
|
|
|
"@AT_CUDNN_ENABLED@": "1",
|
2023-08-01 19:23:21 +00:00
|
|
|
"@AT_CUSPARSELT_ENABLED@": "0",
|
2020-04-07 05:48:33 +00:00
|
|
|
"@AT_ROCM_ENABLED@": "0",
|
2021-10-14 00:48:28 +00:00
|
|
|
"@AT_MAGMA_ENABLED@": "0",
|
2020-04-07 05:48:33 +00:00
|
|
|
"@NVCC_FLAGS_EXTRA@": "",
|
|
|
|
|
},
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
cc_library(
|
|
|
|
|
name = "aten_headers",
|
|
|
|
|
hdrs = [
|
2021-12-07 23:22:44 +00:00
|
|
|
"torch/csrc/Export.h",
|
2020-04-07 05:48:33 +00:00
|
|
|
"torch/csrc/jit/frontend/function_schema_parser.h",
|
2023-03-26 10:27:18 +00:00
|
|
|
] + glob(
|
|
|
|
|
[
|
|
|
|
|
"aten/src/**/*.h",
|
|
|
|
|
"aten/src/**/*.hpp",
|
|
|
|
|
"aten/src/ATen/cuda/**/*.cuh",
|
|
|
|
|
"aten/src/ATen/native/**/*.cuh",
|
|
|
|
|
"aten/src/THC/*.cuh",
|
|
|
|
|
],
|
2021-12-17 21:41:24 +00:00
|
|
|
) + [
|
2020-04-07 05:48:33 +00:00
|
|
|
":aten_src_ATen_config",
|
2022-01-05 14:55:27 +00:00
|
|
|
":generated_aten_cpp",
|
2020-04-07 05:48:33 +00:00
|
|
|
],
|
|
|
|
|
includes = [
|
|
|
|
|
"aten/src",
|
|
|
|
|
],
|
|
|
|
|
deps = [
|
2023-04-05 16:16:05 +00:00
|
|
|
"//c10",
|
2020-04-07 05:48:33 +00:00
|
|
|
],
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
ATEN_COPTS = COMMON_COPTS + [
|
|
|
|
|
"-DCAFFE2_BUILD_MAIN_LIBS",
|
|
|
|
|
"-DHAVE_AVX_CPU_DEFINITION",
|
|
|
|
|
"-DHAVE_AVX2_CPU_DEFINITION",
|
|
|
|
|
"-fvisibility-inlines-hidden",
|
|
|
|
|
"-fno-math-errno",
|
|
|
|
|
"-fno-trapping-math",
|
|
|
|
|
]
|
|
|
|
|
|
|
|
|
|
intern_build_aten_ops(
|
|
|
|
|
copts = ATEN_COPTS,
|
2023-03-26 10:27:18 +00:00
|
|
|
extra_impls = aten_ufunc_generated_cpu_kernel_sources("aten/src/ATen/{}"),
|
2020-04-07 05:48:33 +00:00
|
|
|
deps = [
|
|
|
|
|
":aten_headers",
|
|
|
|
|
"@fbgemm",
|
2021-12-17 21:41:24 +00:00
|
|
|
"@mkl",
|
2023-03-26 10:27:18 +00:00
|
|
|
"@sleef",
|
2024-09-07 04:42:54 +00:00
|
|
|
"@mkl_dnn//:mkl-dnn",
|
2020-04-07 05:48:33 +00:00
|
|
|
],
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
cc_library(
|
|
|
|
|
name = "aten",
|
|
|
|
|
srcs = [
|
|
|
|
|
":ATen_CORE_SRCS",
|
|
|
|
|
":ATen_QUANTIZED_SRCS",
|
|
|
|
|
":aten_base_cpp",
|
2020-10-13 08:44:36 +00:00
|
|
|
":aten_base_metal",
|
2020-08-07 16:04:01 +00:00
|
|
|
":aten_base_vulkan",
|
2020-04-07 05:48:33 +00:00
|
|
|
":aten_native_cpp",
|
|
|
|
|
":aten_native_mkl_cpp",
|
|
|
|
|
":aten_native_mkldnn_cpp",
|
2023-03-26 10:27:18 +00:00
|
|
|
":aten_native_nested_cpp",
|
2020-04-07 05:48:33 +00:00
|
|
|
":aten_native_quantized_cpp",
|
|
|
|
|
":aten_native_sparse_cpp",
|
2022-04-26 00:02:39 +00:00
|
|
|
":aten_native_transformers_cpp",
|
2020-04-07 05:48:33 +00:00
|
|
|
":aten_native_xnnpack",
|
|
|
|
|
":aten_src_ATen_config",
|
2022-02-28 23:46:04 +00:00
|
|
|
] + generated_cpu_cpp + aten_ufunc_generated_cpu_sources("aten/src/ATen/{}"),
|
2020-04-07 05:48:33 +00:00
|
|
|
copts = ATEN_COPTS,
|
2024-05-31 01:20:45 +00:00
|
|
|
linkopts = [
|
|
|
|
|
"-ldl",
|
|
|
|
|
],
|
2020-04-07 05:48:33 +00:00
|
|
|
data = if_cuda(
|
|
|
|
|
[":libcaffe2_nvrtc.so"],
|
|
|
|
|
[],
|
|
|
|
|
),
|
|
|
|
|
visibility = ["//visibility:public"],
|
|
|
|
|
deps = [
|
|
|
|
|
":ATen_CPU",
|
|
|
|
|
":aten_headers",
|
|
|
|
|
":caffe2_for_aten_headers",
|
|
|
|
|
":torch_headers",
|
|
|
|
|
"@fbgemm",
|
|
|
|
|
"@ideep",
|
|
|
|
|
],
|
|
|
|
|
alwayslink = True,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
cc_library(
|
|
|
|
|
name = "aten_nvrtc",
|
|
|
|
|
srcs = glob([
|
|
|
|
|
"aten/src/ATen/cuda/nvrtc_stub/*.cpp",
|
|
|
|
|
]),
|
|
|
|
|
copts = ATEN_COPTS,
|
|
|
|
|
linkstatic = True,
|
|
|
|
|
visibility = ["//visibility:public"],
|
|
|
|
|
deps = [
|
|
|
|
|
":aten_headers",
|
2023-04-05 16:16:05 +00:00
|
|
|
"//c10",
|
2020-04-07 05:48:33 +00:00
|
|
|
"@cuda",
|
|
|
|
|
"@cuda//:cuda_driver",
|
|
|
|
|
"@cuda//:nvrtc",
|
|
|
|
|
],
|
|
|
|
|
alwayslink = True,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
cc_binary(
|
|
|
|
|
name = "libcaffe2_nvrtc.so",
|
|
|
|
|
linkshared = True,
|
|
|
|
|
visibility = ["//visibility:public"],
|
|
|
|
|
deps = [
|
|
|
|
|
":aten_nvrtc",
|
|
|
|
|
],
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
cc_library(
|
|
|
|
|
name = "aten_cuda_cpp",
|
2022-01-05 14:55:27 +00:00
|
|
|
srcs = [":aten_cuda_cpp_srcs"] + generated_cuda_cpp,
|
2021-12-17 21:41:24 +00:00
|
|
|
hdrs = [":aten_src_ATen_cuda_config"],
|
2020-04-07 05:48:33 +00:00
|
|
|
copts = ATEN_COPTS,
|
|
|
|
|
visibility = ["//visibility:public"],
|
|
|
|
|
deps = [
|
|
|
|
|
":aten",
|
|
|
|
|
"@cuda",
|
2021-12-17 21:41:24 +00:00
|
|
|
"@cuda//:cusolver",
|
2020-04-07 05:48:33 +00:00
|
|
|
"@cuda//:nvrtc",
|
|
|
|
|
"@cudnn",
|
2024-01-03 15:41:28 +00:00
|
|
|
"@cudnn_frontend",
|
2020-04-07 05:48:33 +00:00
|
|
|
],
|
|
|
|
|
alwayslink = True,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
torch_cuda_half_options = [
|
|
|
|
|
"-DCUDA_HAS_FP16=1",
|
|
|
|
|
"-D__CUDA_NO_HALF_OPERATORS__",
|
2023-02-13 19:03:36 +00:00
|
|
|
"-D__CUDA_NO_HALF_CONVERSIONS__",
|
2020-10-02 23:19:14 +00:00
|
|
|
"-D__CUDA_NO_BFLOAT16_CONVERSIONS__",
|
2020-04-07 05:48:33 +00:00
|
|
|
"-D__CUDA_NO_HALF2_OPERATORS__",
|
|
|
|
|
]
|
|
|
|
|
|
|
|
|
|
cu_library(
|
|
|
|
|
name = "aten_cuda",
|
2021-12-17 21:41:24 +00:00
|
|
|
srcs = [":aten_cu_srcs"],
|
2020-04-07 05:48:33 +00:00
|
|
|
copts = ATEN_COPTS + torch_cuda_half_options,
|
|
|
|
|
visibility = ["//visibility:public"],
|
|
|
|
|
deps = [
|
|
|
|
|
":aten_cuda_cpp",
|
2023-04-06 12:54:20 +00:00
|
|
|
"//c10/util:bit_cast",
|
2020-04-07 05:48:33 +00:00
|
|
|
"@cuda//:cublas",
|
|
|
|
|
"@cuda//:cufft",
|
|
|
|
|
"@cuda//:cusparse",
|
2022-10-18 23:11:47 +00:00
|
|
|
"@cutlass",
|
2020-04-07 05:48:33 +00:00
|
|
|
],
|
|
|
|
|
alwayslink = True,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
# caffe2
|
|
|
|
|
CAFFE2_COPTS = COMMON_COPTS + [
|
|
|
|
|
"-Dcaffe2_EXPORTS",
|
|
|
|
|
"-DCAFFE2_USE_CUDNN",
|
|
|
|
|
"-DCAFFE2_BUILD_MAIN_LIB",
|
|
|
|
|
"-fvisibility-inlines-hidden",
|
|
|
|
|
"-fno-math-errno",
|
|
|
|
|
"-fno-trapping-math",
|
|
|
|
|
]
|
|
|
|
|
|
|
|
|
|
filegroup(
|
|
|
|
|
name = "caffe2_core_srcs",
|
|
|
|
|
srcs = [
|
|
|
|
|
"caffe2/core/common.cc",
|
|
|
|
|
],
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
filegroup(
|
|
|
|
|
name = "caffe2_perfkernels_srcs",
|
|
|
|
|
srcs = [
|
|
|
|
|
"caffe2/perfkernels/embedding_lookup_idx.cc",
|
|
|
|
|
],
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
filegroup(
|
|
|
|
|
name = "caffe2_serialize_srcs",
|
|
|
|
|
srcs = [
|
|
|
|
|
"caffe2/serialize/file_adapter.cc",
|
|
|
|
|
"caffe2/serialize/inline_container.cc",
|
|
|
|
|
"caffe2/serialize/istream_adapter.cc",
|
2024-09-13 16:42:37 +00:00
|
|
|
"caffe2/serialize/read_adapter_interface.cc",
|
2020-04-07 05:48:33 +00:00
|
|
|
],
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
filegroup(
|
|
|
|
|
name = "caffe2_utils_srcs",
|
|
|
|
|
srcs = [
|
|
|
|
|
"caffe2/utils/proto_wrap.cc",
|
|
|
|
|
"caffe2/utils/threadpool/ThreadPool.cc",
|
|
|
|
|
"caffe2/utils/threadpool/pthreadpool.cc",
|
|
|
|
|
"caffe2/utils/threadpool/pthreadpool_impl.cc",
|
2021-03-03 19:37:36 +00:00
|
|
|
"caffe2/utils/threadpool/thread_pool_guard.cpp",
|
2020-04-07 05:48:33 +00:00
|
|
|
],
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
# To achieve finer granularity and make debugging easier, caffe2 is split into three libraries:
|
|
|
|
|
# ATen, caffe2 and caffe2_for_aten_headers. The ATen lib groups source code under the
|
|
|
|
|
# aten/ directory and caffe2 contains most files under the `caffe2/` directory. Since the
|
|
|
|
|
# ATen lib and the caffe2 lib depend on each other, `caffe2_for_aten_headers` is split
|
|
|
|
|
# out from `caffe2` to avoid a dependency cycle.
|
|
|
|
|
cc_library(
|
|
|
|
|
name = "caffe2_for_aten_headers",
|
|
|
|
|
hdrs = [
|
|
|
|
|
"caffe2/core/common.h",
|
|
|
|
|
"caffe2/perfkernels/common.h",
|
|
|
|
|
"caffe2/perfkernels/embedding_lookup_idx.h",
|
2023-03-26 10:27:18 +00:00
|
|
|
"caffe2/utils/fixed_divisor.h",
|
2020-04-07 05:48:33 +00:00
|
|
|
] + glob([
|
|
|
|
|
"caffe2/utils/threadpool/*.h",
|
|
|
|
|
]),
|
|
|
|
|
copts = CAFFE2_COPTS,
|
|
|
|
|
visibility = ["//visibility:public"],
|
|
|
|
|
deps = [
|
2023-04-04 00:42:58 +00:00
|
|
|
":caffe2_core_macros",
|
2023-04-05 16:16:05 +00:00
|
|
|
"//c10",
|
2020-04-07 05:48:33 +00:00
|
|
|
],
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
cc_library(
|
|
|
|
|
name = "caffe2_headers",
|
2023-04-04 00:42:58 +00:00
|
|
|
hdrs = glob(
|
|
|
|
|
[
|
|
|
|
|
"caffe2/perfkernels/*.h",
|
|
|
|
|
"caffe2/serialize/*.h",
|
|
|
|
|
"caffe2/utils/*.h",
|
|
|
|
|
"caffe2/utils/threadpool/*.h",
|
|
|
|
|
"modules/**/*.h",
|
|
|
|
|
],
|
|
|
|
|
exclude = [
|
|
|
|
|
"caffe2/core/macros.h",
|
|
|
|
|
],
|
|
|
|
|
) + if_cuda(glob([
|
2020-04-07 05:48:33 +00:00
|
|
|
"caffe2/**/*.cuh",
|
2024-04-29 06:27:13 +00:00
|
|
|
])),
|
2020-04-07 05:48:33 +00:00
|
|
|
copts = CAFFE2_COPTS,
|
|
|
|
|
visibility = ["//visibility:public"],
|
|
|
|
|
deps = [
|
2023-04-04 00:42:58 +00:00
|
|
|
":caffe2_core_macros",
|
2020-04-07 05:48:33 +00:00
|
|
|
":caffe2_for_aten_headers",
|
|
|
|
|
],
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
cc_library(
|
|
|
|
|
name = "caffe2",
|
|
|
|
|
srcs = [
|
|
|
|
|
":caffe2_core_srcs",
|
|
|
|
|
":caffe2_perfkernels_srcs",
|
|
|
|
|
":caffe2_serialize_srcs",
|
|
|
|
|
":caffe2_utils_srcs",
|
|
|
|
|
],
|
|
|
|
|
copts = CAFFE2_COPTS + ["-mf16c"],
|
|
|
|
|
linkstatic = 1,
|
|
|
|
|
visibility = ["//visibility:public"],
|
|
|
|
|
deps = [
|
2023-04-04 00:42:58 +00:00
|
|
|
":caffe2_core_macros",
|
2023-03-26 10:27:18 +00:00
|
|
|
":caffe2_headers",
|
2020-04-07 05:48:33 +00:00
|
|
|
":caffe2_perfkernels_avx",
|
|
|
|
|
":caffe2_perfkernels_avx2",
|
2022-06-22 15:02:16 +00:00
|
|
|
"//third_party/miniz-2.1.0:miniz",
|
2020-04-07 05:48:33 +00:00
|
|
|
"@com_google_protobuf//:protobuf",
|
|
|
|
|
"@eigen",
|
2020-04-13 22:59:48 +00:00
|
|
|
"@fbgemm//:fbgemm_src_headers",
|
2023-03-26 10:27:18 +00:00
|
|
|
"@fmt",
|
2020-04-07 05:48:33 +00:00
|
|
|
"@onnx",
|
|
|
|
|
] + if_cuda(
|
|
|
|
|
[
|
|
|
|
|
":aten_cuda",
|
2020-09-21 17:16:40 +00:00
|
|
|
"@tensorpipe//:tensorpipe_cuda",
|
|
|
|
|
],
|
|
|
|
|
[
|
|
|
|
|
":aten",
|
2021-06-18 12:12:27 +00:00
|
|
|
"@tensorpipe//:tensorpipe_cpu",
|
2020-04-07 05:48:33 +00:00
|
|
|
],
|
|
|
|
|
),
|
|
|
|
|
alwayslink = True,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
cu_library(
|
2024-05-09 02:19:59 +00:00
|
|
|
name = "torch_cuda",
|
|
|
|
|
srcs = [
|
2023-12-20 04:16:42 +00:00
|
|
|
"torch/csrc/distributed/c10d/intra_node_comm.cu",
|
2024-08-29 05:39:36 +00:00
|
|
|
"torch/csrc/distributed/c10d/NanCheck.cu",
|
2023-12-20 04:16:42 +00:00
|
|
|
"torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",
|
|
|
|
|
],
|
2024-05-09 02:19:59 +00:00
|
|
|
copts = torch_cuda_half_options,
|
2020-04-07 05:48:33 +00:00
|
|
|
visibility = ["//visibility:public"],
|
|
|
|
|
deps = [
|
|
|
|
|
":aten",
|
|
|
|
|
"@cuda//:cublas",
|
|
|
|
|
"@cuda//:curand",
|
|
|
|
|
"@cudnn",
|
|
|
|
|
"@eigen",
|
2020-09-21 17:16:40 +00:00
|
|
|
"@tensorpipe//:tensorpipe_cuda",
|
2020-04-07 05:48:33 +00:00
|
|
|
],
|
|
|
|
|
alwayslink = True,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
PERF_COPTS = [
|
|
|
|
|
"-DHAVE_AVX_CPU_DEFINITION",
|
|
|
|
|
"-DHAVE_AVX2_CPU_DEFINITION",
|
|
|
|
|
"-DENABLE_ALIAS=1",
|
|
|
|
|
"-DHAVE_MALLOC_USABLE_SIZE=1",
|
|
|
|
|
"-DHAVE_MMAP=1",
|
|
|
|
|
"-DHAVE_SHM_OPEN=1",
|
|
|
|
|
"-DHAVE_SHM_UNLINK=1",
|
|
|
|
|
"-DSLEEF_STATIC_LIBS=1",
|
2021-12-17 21:41:24 +00:00
|
|
|
"-DTH_BALS_MKL",
|
2020-04-07 05:48:33 +00:00
|
|
|
"-D_FILE_OFFSET_BITS=64",
|
|
|
|
|
"-DUSE_FBGEMM",
|
|
|
|
|
"-fvisibility-inlines-hidden",
|
|
|
|
|
"-Wunused-parameter",
|
|
|
|
|
"-fno-math-errno",
|
|
|
|
|
"-fno-trapping-math",
|
|
|
|
|
"-mf16c",
|
|
|
|
|
]
|
|
|
|
|
|
|
|
|
|
PERF_HEADERS = glob([
|
|
|
|
|
"caffe2/perfkernels/*.h",
|
|
|
|
|
"caffe2/core/*.h",
|
|
|
|
|
])
|
|
|
|
|
|
|
|
|
|
cc_library(
|
|
|
|
|
name = "caffe2_perfkernels_avx",
|
|
|
|
|
srcs = glob([
|
|
|
|
|
"caffe2/perfkernels/*_avx.cc",
|
|
|
|
|
]),
|
|
|
|
|
hdrs = PERF_HEADERS,
|
|
|
|
|
copts = PERF_COPTS + [
|
|
|
|
|
"-mavx",
|
|
|
|
|
],
|
|
|
|
|
visibility = ["//visibility:public"],
|
|
|
|
|
deps = [
|
|
|
|
|
":caffe2_headers",
|
2021-12-16 21:07:58 +00:00
|
|
|
"//c10",
|
2020-04-07 05:48:33 +00:00
|
|
|
],
|
|
|
|
|
alwayslink = True,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
cc_library(
|
|
|
|
|
name = "caffe2_perfkernels_avx2",
|
|
|
|
|
srcs = glob([
|
|
|
|
|
"caffe2/perfkernels/*_avx2.cc",
|
|
|
|
|
]),
|
|
|
|
|
hdrs = PERF_HEADERS,
|
|
|
|
|
copts = PERF_COPTS + [
|
|
|
|
|
"-mavx2",
|
|
|
|
|
"-mfma",
|
|
|
|
|
"-mavx",
|
|
|
|
|
],
|
|
|
|
|
visibility = ["//visibility:public"],
|
|
|
|
|
deps = [
|
|
|
|
|
":caffe2_headers",
|
2021-12-16 21:07:58 +00:00
|
|
|
"//c10",
|
2020-04-07 05:48:33 +00:00
|
|
|
],
|
|
|
|
|
alwayslink = True,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
# torch
|
[NVFUSER] refactor nvfuser build (#89621)
This PR is the first step towards refactoring the build for nvfuser, in order to have the codegen be a standalone library.
Contents inside this PR:
1. nvfuser code base has been moved to `./nvfuser`, from `./torch/csrc/jit/codegen/cuda/`, except for registration code for integration (interface.h/interface.cpp)
2. splits the build system so that nvfuser generates its own `.so` files. Currently there are:
- `libnvfuser_codegen.so`, which contains the integration, codegen and runtime system of nvfuser
- `nvfuser.so`, which is nvfuser's python API via pybind. The Python frontend is now exposed via `nvfuser._C.XXX` instead of `torch._C._nvfuser` (see the short sketch after this list)
3. nvfuser cpp tests are currently compiled into `nvfuser_tests`
4. cmake is refactored so that:
- nvfuser now has its own `CMakeLists.txt`, which is under `torch/csrc/jit/codegen/cuda/`.
- nvfuser backend code is not compiled inside `libtorch_cuda_xxx` any more
- nvfuser is added as a subdirectory under `./CMakeLists.txt` at the very end after torch is built.
- since nvfuser has a dependency on torch, the registration of nvfuser at runtime is done via dlopen (`at::DynamicLibrary`). This avoids a circular dependency in cmake, which would be a nightmare to handle. For details, look at `torch/csrc/jit/codegen/cuda/interface.cpp::LoadingNvfuserLibrary`
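A short sketch of the user-visible Python change from item 2 above (the `Fusion` symbol is an assumed example; only the module move itself is described by this PR):
```python
# Old location (before this refactor):
#   from torch._C._nvfuser import Fusion
# New location, via nvfuser's own pybind extension:
from nvfuser._C import Fusion  # `Fusion` is illustrative, not a confirmed name

f = Fusion()
```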
Future work that's scoped in following PR:
- Currently since nvfuser codegen has dependency on torch, we need to refactor that out so we can move nvfuser into a submodule and not rely on dlopen to load the library. @malfet
- Since we moved nvfuser into a cmake build, we effectively disabled the bazel build for nvfuser. This could impact internal workloads at Meta, so we need to put support back. cc'ing @vors
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89621
Approved by: https://github.com/davidberard98
2023-01-26 02:50:44 +00:00
|
|
|
torch_cuda_headers = glob(["torch/csrc/cuda/*.h"])
|
2021-12-17 21:41:24 +00:00
|
|
|
|
2020-04-07 05:48:33 +00:00
|
|
|
cc_library(
|
|
|
|
|
name = "torch_headers",
|
|
|
|
|
hdrs = if_cuda(
|
2020-05-19 05:00:55 +00:00
|
|
|
torch_cuda_headers,
|
2020-04-07 05:48:33 +00:00
|
|
|
) + glob(
|
|
|
|
|
[
|
|
|
|
|
"torch/*.h",
|
2020-05-19 05:00:55 +00:00
|
|
|
"torch/csrc/**/*.h",
|
2024-05-21 18:00:09 +00:00
|
|
|
"torch/csrc/distributed/c10d/**/*.hpp",
|
2020-04-07 05:48:33 +00:00
|
|
|
"torch/lib/libshm/*.h",
|
|
|
|
|
],
|
|
|
|
|
exclude = [
|
2023-03-21 21:34:58 +00:00
|
|
|
"torch/csrc/*/generated/*.h",
|
2020-05-19 05:00:55 +00:00
|
|
|
] + torch_cuda_headers,
|
2022-05-16 21:40:31 +00:00
|
|
|
) + GENERATED_AUTOGRAD_CPP + [":version_h"],
|
2020-04-07 05:48:33 +00:00
|
|
|
includes = [
|
2023-03-26 10:27:18 +00:00
|
|
|
"third_party/kineto/libkineto/include",
|
2020-04-07 05:48:33 +00:00
|
|
|
"torch/csrc",
|
|
|
|
|
"torch/csrc/api/include",
|
2021-06-24 19:37:29 +00:00
|
|
|
"torch/csrc/distributed",
|
2020-04-07 05:48:33 +00:00
|
|
|
"torch/lib",
|
|
|
|
|
"torch/lib/libshm",
|
|
|
|
|
],
|
|
|
|
|
visibility = ["//visibility:public"],
|
|
|
|
|
deps = [
|
|
|
|
|
":aten_headers",
|
|
|
|
|
":caffe2_headers",
|
2023-04-05 16:16:05 +00:00
|
|
|
"//c10",
|
2022-01-13 00:27:21 +00:00
|
|
|
"@com_github_google_flatbuffers//:flatbuffers",
|
2020-04-15 04:45:30 +00:00
|
|
|
"@local_config_python//:python_headers",
|
2020-04-07 05:48:33 +00:00
|
|
|
"@onnx",
|
|
|
|
|
],
|
|
|
|
|
alwayslink = True,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
TORCH_COPTS = COMMON_COPTS + [
|
|
|
|
|
"-Dtorch_EXPORTS",
|
|
|
|
|
"-DHAVE_AVX_CPU_DEFINITION",
|
|
|
|
|
"-DHAVE_AVX2_CPU_DEFINITION",
|
|
|
|
|
"-DCAFFE2_USE_GLOO",
|
|
|
|
|
"-fvisibility-inlines-hidden",
|
|
|
|
|
"-fno-math-errno ",
|
|
|
|
|
"-fno-trapping-math",
|
2023-06-02 22:04:39 +00:00
|
|
|
"-Wno-error=unused-function",
|
2020-04-07 05:48:33 +00:00
|
|
|
]
|
|
|
|
|
|
2023-03-26 10:27:18 +00:00
|
|
|
torch_sources = {
|
|
|
|
|
k: ""
|
|
|
|
|
for k in (
|
|
|
|
|
libtorch_core_sources +
|
|
|
|
|
libtorch_distributed_sources +
|
|
|
|
|
torch_cpp_srcs +
|
|
|
|
|
libtorch_extra_sources +
|
|
|
|
|
jit_core_sources +
|
|
|
|
|
lazy_tensor_ts_sources +
|
|
|
|
|
GENERATED_AUTOGRAD_CPP
|
|
|
|
|
)
|
|
|
|
|
}.keys()
|
2022-12-08 03:48:04 +00:00
|
|
|
|
2020-04-07 05:48:33 +00:00
|
|
|
cc_library(
|
|
|
|
|
name = "torch",
|
|
|
|
|
srcs = if_cuda(glob(
|
2021-12-17 21:41:24 +00:00
|
|
|
libtorch_cuda_sources,
|
2020-04-07 05:48:33 +00:00
|
|
|
exclude = [
|
|
|
|
|
"torch/csrc/cuda/python_nccl.cpp",
|
|
|
|
|
"torch/csrc/cuda/nccl.cpp",
|
2023-12-20 04:16:42 +00:00
|
|
|
"torch/csrc/distributed/c10d/intra_node_comm.cu",
|
Introduce a prototype for SymmetricMemory (#128582)
Stack from [ghstack](https://github.com/ezyang/ghstack) (oldest at bottom):
This PR introduces a prototype for `SymmetricMemory` (including a CUDA implementation) - a remote-memory access-based communication primitive. It allows for user-defined communication patterns/kernels and is designed to be torch.compile-friendly. It addresses the major limitations of `IntraNodeComm` and `ProcessGroupCudaP2p` and serves as a replacement for them.
### SymmetricMemory
`SymmetricMemory` represents symmetric allocations across a group of devices. The allocations represented by a `SymmetricMemory` object are accessible by all devices in the group. The class can be used for **op-level custom communication patterns** (via the get_buffer APIs and the synchronization primitives), as well as **custom communication kernels** (via the buffer and signal_pad device pointers).
### Python API Example
```python
from torch._C._distributed_c10d import _SymmetricMemory
# Set a store for rendezvousing symmetric allocations on a group of devices
# identified by group_name. The concept of groups is logical; users can
# utilize predefined groups (e.g., a group of devices identified by a
# ProcessGroup) or create custom ones. Note that SymmetricMemoryAllocator
# backends might employ a more efficient communication channel for the actual
# rendezvous process and only use the store for bootstrapping purposes.
_SymmetricMemory.set_group_info(group_name, rank, world_size, store)
# Identical to empty_strided, but allows symmetric memory access to be
# established for the allocated tensor via _SymmetricMemory.rendezvous().
# This function itself is not a collective operation.
t = _SymmetricMemory.empty_strided_p2p((64, 64), (64, 1), torch.float32, group_name)
# Users can write Python custom ops that leverages the symmetric memory access.
# Below are examples of things users can do (assuming the group's world_size is 2).
# Establishes symmetric memory access on tensors allocated via
# _SymmetricMemory.empty_strided_p2p(). rendezvous() is a one-time process,
# and the mapping between a local memory region and the associated SymmetricMemory
# object is unique. Subsequent calls to rendezvous() with the same tensor will receive
# the cached SymmetricMemory object.
#
# The function has a collective semantic and must be invoked simultaneously
# from all rendezvous participants.
symm_mem = _SymmetricMemory.rendezvous(t)
# This represents the allocation on rank 0 and is accessible from all devices.
buf = symm_mem.get_buffer(0, (64, 64), torch.float32)
if symm_mem.rank == 0:
symm_mem.wait_signal(src_rank=1)
assert buf.eq(42).all()
else:
# The remote buffer can be used as a regular tensor
buf.fill_(42)
symm_mem.put_signal(dst_rank=0)
symm_mem.barrier()
if symm_mem.rank == 0:
symm_mem.barrier()
assert buf.eq(43).all()
else:
new_val = torch.empty_like(buf)
new_val.fill_(43)
# Contiguous copies to/from a remote buffer utilize copy engines
# which bypasses SMs (i.e. no need to load the data into registers)
buf.copy_(new_val)
symm_mem.barrier()
```
### Custom CUDA Comm Kernels
Given a tensor, users can access the associated `SymmetricMemory` which provides pointer to remote buffers/signal_pads needed for custom communication kernels.
```cpp
TORCH_API c10::intrusive_ptr<SymmetricMemory> get_symmetric_memory(
const at::Tensor& tensor);
class TORCH_API SymmetricMemory : public c10::intrusive_ptr_target {
public:
...
virtual std::vector<void*> get_buffer_ptrs() = 0;
virtual std::vector<void*> get_signal_pad_ptrs() = 0;
virtual void** get_buffer_ptrs_dev() = 0;
virtual void** get_signal_pad_ptrs_dev() = 0;
virtual size_t get_buffer_size() = 0;
virtual size_t get_signal_pad_size() = 0;
virtual int get_rank() = 0;
virtual int get_world_size() = 0;
...
};
```
### Limitations of IntraNodeComm and ProcessGroupCudaP2p
`IntraNodeComm` (used by `ProcessGroupCudaP2p`) manages a single fixed-size workspace. This approach:
- Leads to awkward UX in which the required workspace needs to be specified upfront.
- Cannot avoid extra copies for some algorithms in eager mode (e.g., custom/multimem all-reduce, reduce-scatter, all-gather).
- Prevents torch.compile from eliminating all copies.
In addition, they only offer out-of-the-box communication kernels and don't expose required pointers for user-defined, custom CUDA comm kernels.
* __->__ #128582
Differential Revision: [D58849033](https://our.internmc.facebook.com/intern/diff/D58849033)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128582
Approved by: https://github.com/wanchaol
2024-06-20 23:54:19 +00:00
|
|
|
"torch/csrc/distributed/c10d/CUDASymmetricMemory.cu",
|
2024-08-23 04:02:50 +00:00
|
|
|
"torch/csrc/distributed/c10d/CUDASymmetricMemoryOps.cu",
|
2024-08-29 05:39:36 +00:00
|
|
|
"torch/csrc/distributed/c10d/NanCheck.cu",
|
2021-12-17 21:41:24 +00:00
|
|
|
"torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",
|
2020-04-07 05:48:33 +00:00
|
|
|
],
|
2022-12-08 03:48:04 +00:00
|
|
|
)) + torch_sources,
|
2021-12-17 21:41:24 +00:00
|
|
|
copts = TORCH_COPTS,
|
2024-05-31 01:20:45 +00:00
|
|
|
linkopts = [
|
|
|
|
|
"-lrt",
|
|
|
|
|
],
|
2020-04-07 05:48:33 +00:00
|
|
|
defines = [
|
|
|
|
|
"CAFFE2_NIGHTLY_VERSION=20200115",
|
|
|
|
|
],
|
|
|
|
|
visibility = ["//visibility:public"],
|
|
|
|
|
deps = [
|
|
|
|
|
":caffe2",
|
|
|
|
|
":torch_headers",
|
2022-07-15 01:35:53 +00:00
|
|
|
"@kineto",
|
2024-06-04 19:44:30 +00:00
|
|
|
"@cpp-httplib",
|
2024-06-26 21:59:23 +00:00
|
|
|
"@nlohmann",
|
2021-12-17 21:41:24 +00:00
|
|
|
] + if_cuda([
|
|
|
|
|
"@cuda//:nvToolsExt",
|
2022-10-18 23:11:47 +00:00
|
|
|
"@cutlass",
|
2024-05-09 02:19:59 +00:00
|
|
|
":torch_cuda",
|
2021-12-17 21:41:24 +00:00
|
|
|
]),
|
2020-04-07 05:48:33 +00:00
|
|
|
alwayslink = True,
|
|
|
|
|
)
|
|
|
|
|
|
2020-05-20 05:49:28 +00:00
|
|
|
cc_library(
|
|
|
|
|
name = "shm",
|
|
|
|
|
srcs = glob(["torch/lib/libshm/*.cpp"]),
|
2024-05-31 01:20:45 +00:00
|
|
|
linkopts = [
|
|
|
|
|
"-lrt",
|
|
|
|
|
],
|
2020-05-20 05:49:28 +00:00
|
|
|
deps = [
|
2023-10-13 17:57:53 +00:00
|
|
|
":torch",
|
2020-05-20 05:49:28 +00:00
|
|
|
],
|
|
|
|
|
)
|
|
|
|
|
|
2020-04-07 05:48:33 +00:00
|
|
|
cc_library(
|
|
|
|
|
name = "libtorch_headers",
|
|
|
|
|
hdrs = glob([
|
|
|
|
|
"**/*.h",
|
|
|
|
|
"**/*.cuh",
|
|
|
|
|
]) + [
|
2022-05-16 21:40:31 +00:00
|
|
|
# We need the filegroup here because the raw list causes Bazel
|
|
|
|
|
# to see duplicate files. It knows how to deduplicate with the
|
|
|
|
|
# filegroup.
|
2023-03-26 10:27:18 +00:00
|
|
|
":cpp_generated_code",
|
2020-04-07 05:48:33 +00:00
|
|
|
],
|
|
|
|
|
includes = [
|
|
|
|
|
"torch/csrc/api/include",
|
2021-06-24 19:37:29 +00:00
|
|
|
"torch/csrc/distributed",
|
2020-04-07 05:48:33 +00:00
|
|
|
"torch/lib",
|
|
|
|
|
"torch/lib/libshm",
|
|
|
|
|
],
|
|
|
|
|
visibility = ["//visibility:public"],
|
|
|
|
|
deps = [
|
2021-12-17 21:41:24 +00:00
|
|
|
":torch_headers",
|
2020-04-07 05:48:33 +00:00
|
|
|
],
|
|
|
|
|
)
|
|
|
|
|
|
2020-05-20 05:49:28 +00:00
|
|
|
cc_library(
|
|
|
|
|
name = "torch_python",
|
2023-05-12 19:43:56 +00:00
|
|
|
srcs = libtorch_python_core_sources
|
|
|
|
|
+ if_cuda(libtorch_python_cuda_sources)
|
|
|
|
|
+ if_cuda(libtorch_python_distributed_sources)
|
|
|
|
|
+ GENERATED_AUTOGRAD_PYTHON,
|
|
|
|
|
hdrs = glob([
|
|
|
|
|
"torch/csrc/generic/*.cpp",
|
|
|
|
|
]),
|
|
|
|
|
copts = COMMON_COPTS + if_cuda(["-DUSE_CUDA=1"]),
|
2020-05-20 05:49:28 +00:00
|
|
|
deps = [
|
2023-03-26 10:27:18 +00:00
|
|
|
":torch",
|
2023-05-12 19:43:56 +00:00
|
|
|
":shm",
|
2021-12-17 21:41:24 +00:00
|
|
|
"@pybind11",
|
2020-05-20 05:49:28 +00:00
|
|
|
],
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
pybind_extension(
|
2023-05-12 19:43:56 +00:00
|
|
|
name = "torch/_C",
|
2020-06-02 20:07:55 +00:00
|
|
|
srcs = ["torch/csrc/stub.c"],
|
2020-05-20 05:49:28 +00:00
|
|
|
deps = [
|
2023-03-26 10:27:18 +00:00
|
|
|
":torch_python",
|
2023-05-12 19:43:56 +00:00
|
|
|
":aten_nvrtc",
|
|
|
|
|
],
|
|
|
|
|
)
|
|
|
|
|
|
2023-05-18 20:29:03 +00:00
|
|
|
cc_library(
|
|
|
|
|
name = "functorch",
|
|
|
|
|
hdrs = glob([
|
|
|
|
|
"functorch/csrc/dim/*.h",
|
|
|
|
|
]),
|
|
|
|
|
srcs = glob([
|
|
|
|
|
"functorch/csrc/dim/*.cpp",
|
|
|
|
|
]),
|
|
|
|
|
deps = [
|
|
|
|
|
":aten_nvrtc",
|
|
|
|
|
":torch_python",
|
|
|
|
|
"@pybind11",
|
|
|
|
|
],
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
pybind_extension(
|
|
|
|
|
name = "functorch/_C",
|
|
|
|
|
copts = [
|
|
|
|
|
"-DTORCH_EXTENSION_NAME=_C"
|
|
|
|
|
],
|
|
|
|
|
srcs = [
|
|
|
|
|
"functorch/csrc/init_dim_only.cpp",
|
|
|
|
|
],
|
|
|
|
|
deps = [
|
|
|
|
|
":functorch",
|
|
|
|
|
":torch_python",
|
|
|
|
|
":aten_nvrtc",
|
|
|
|
|
],
|
|
|
|
|
)
|
|
|
|
|
|
2023-05-12 19:43:56 +00:00
|
|
|
cc_binary(
|
|
|
|
|
name = "torch/bin/torch_shm_manager",
|
|
|
|
|
srcs = [
|
|
|
|
|
"torch/lib/libshm/manager.cpp",
|
|
|
|
|
],
|
|
|
|
|
deps = [
|
|
|
|
|
":shm",
|
|
|
|
|
],
|
|
|
|
|
linkstatic = False,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
template_rule(
|
|
|
|
|
name = "gen_version_py",
|
|
|
|
|
src = ":torch/version.py.tpl",
|
|
|
|
|
out = "torch/version.py",
|
|
|
|
|
substitutions = if_cuda({
|
|
|
|
|
# Set default to 11.2. Otherwise Torchvision complains about incompatibility.
|
|
|
|
|
"{{CUDA_VERSION}}": "11.2",
|
|
|
|
|
"{{VERSION}}": "2.0.0",
|
|
|
|
|
}, {
|
|
|
|
|
"{{CUDA_VERSION}}": "None",
|
|
|
|
|
"{{VERSION}}": "2.0.0",
|
|
|
|
|
}),
|
|
|
|
|
)
|
|
|
|
|
|
2023-05-23 06:20:33 +00:00
|
|
|
py_library(
|
2023-05-12 19:43:56 +00:00
|
|
|
name = "pytorch_py",
|
|
|
|
|
visibility = ["//visibility:public"],
|
2023-05-18 20:29:03 +00:00
|
|
|
srcs = glob(["torch/**/*.py"], exclude = ["torch/version.py"]) + [":torch/version.py"] + glob(["functorch/**/*.py"]),
|
2023-05-12 19:43:56 +00:00
|
|
|
deps = [
|
|
|
|
|
rules.requirement("numpy"),
|
|
|
|
|
rules.requirement("pyyaml"),
|
|
|
|
|
rules.requirement("requests"),
|
|
|
|
|
rules.requirement("setuptools"),
|
2023-05-23 06:20:33 +00:00
|
|
|
rules.requirement("sympy"),
|
2023-05-12 19:43:56 +00:00
|
|
|
rules.requirement("typing_extensions"),
|
|
|
|
|
"//torchgen",
|
|
|
|
|
],
|
|
|
|
|
data = [
|
|
|
|
|
":torch/_C.so",
|
2023-05-18 20:29:03 +00:00
|
|
|
":functorch/_C.so",
|
2023-05-12 19:43:56 +00:00
|
|
|
":torch/bin/torch_shm_manager",
|
2020-05-20 05:49:28 +00:00
|
|
|
],
|
|
|
|
|
)
|
|
|
|
|
|
2020-04-07 05:48:33 +00:00
|
|
|
# cpp api tests
|
|
|
|
|
cc_library(
|
|
|
|
|
name = "test_support",
|
|
|
|
|
testonly = True,
|
|
|
|
|
srcs = [
|
|
|
|
|
"test/cpp/api/support.cpp",
|
|
|
|
|
],
|
|
|
|
|
hdrs = [
|
2020-05-19 05:00:55 +00:00
|
|
|
"test/cpp/api/init_baseline.h",
|
|
|
|
|
"test/cpp/api/optim_baseline.h",
|
2020-04-07 05:48:33 +00:00
|
|
|
"test/cpp/api/support.h",
|
|
|
|
|
"test/cpp/common/support.h",
|
|
|
|
|
],
|
|
|
|
|
deps = [
|
|
|
|
|
":torch",
|
|
|
|
|
"@com_google_googletest//:gtest_main",
|
|
|
|
|
],
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
# Torch integration tests rely on a labeled data set from the MNIST database.
|
|
|
|
|
# http://yann.lecun.com/exdb/mnist/
|
|
|
|
|
|
2021-12-17 21:41:24 +00:00
|
|
|
cpp_api_tests = glob(
|
|
|
|
|
["test/cpp/api/*.cpp"],
|
2022-09-06 18:14:08 +00:00
|
|
|
exclude = [
|
|
|
|
|
"test/cpp/api/imethod.cpp",
|
|
|
|
|
"test/cpp/api/integration.cpp",
|
|
|
|
|
],
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
cc_test(
|
2023-03-26 10:27:18 +00:00
|
|
|
name = "integration_test",
|
|
|
|
|
size = "medium",
|
|
|
|
|
srcs = ["test/cpp/api/integration.cpp"],
|
|
|
|
|
data = [
|
2022-09-06 18:14:08 +00:00
|
|
|
":download_mnist",
|
2023-03-26 10:27:18 +00:00
|
|
|
],
|
|
|
|
|
tags = [
|
|
|
|
|
"gpu-required",
|
|
|
|
|
],
|
|
|
|
|
deps = [
|
|
|
|
|
":test_support",
|
|
|
|
|
"@com_google_googletest//:gtest_main",
|
|
|
|
|
],
|
2021-12-17 21:41:24 +00:00
|
|
|
)
|
|
|
|
|
|
2020-05-19 05:00:55 +00:00
|
|
|
[
|
2023-03-26 10:27:18 +00:00
|
|
|
cc_test(
|
|
|
|
|
name = paths.split_extension(paths.basename(filename))[0].replace("-", "_") + "_test",
|
|
|
|
|
size = "medium",
|
|
|
|
|
srcs = [filename],
|
|
|
|
|
deps = [
|
|
|
|
|
":test_support",
|
|
|
|
|
"@com_google_googletest//:gtest_main",
|
|
|
|
|
],
|
|
|
|
|
)
|
|
|
|
|
for filename in cpp_api_tests
|
2020-05-19 05:00:55 +00:00
|
|
|
]
|
2020-04-07 05:48:33 +00:00
|
|
|
|
|
|
|
|
test_suite(
|
|
|
|
|
name = "api_tests",
|
|
|
|
|
tests = [
|
|
|
|
|
"any_test",
|
|
|
|
|
"autograd_test",
|
|
|
|
|
"dataloader_test",
|
|
|
|
|
"enum_test",
|
|
|
|
|
"expanding_array_test",
|
|
|
|
|
"functional_test",
|
|
|
|
|
"init_test",
|
|
|
|
|
"integration_test",
|
|
|
|
|
"jit_test",
|
|
|
|
|
"memory_test",
|
|
|
|
|
"misc_test",
|
|
|
|
|
"module_test",
|
|
|
|
|
"modulelist_test",
|
|
|
|
|
"modules_test",
|
|
|
|
|
"nn_utils_test",
|
|
|
|
|
"optim_test",
|
|
|
|
|
"ordered_dict_test",
|
|
|
|
|
"rnn_test",
|
|
|
|
|
"sequential_test",
|
|
|
|
|
"serialize_test",
|
|
|
|
|
"static_test",
|
|
|
|
|
"tensor_options_test",
|
|
|
|
|
"tensor_test",
|
|
|
|
|
"torch_include_test",
|
|
|
|
|
],
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
# dist autograd tests
|
|
|
|
|
cc_test(
|
|
|
|
|
name = "torch_dist_autograd_test",
|
|
|
|
|
size = "small",
|
|
|
|
|
srcs = ["test/cpp/dist_autograd/test_dist_autograd.cpp"],
|
|
|
|
|
tags = [
|
|
|
|
|
"exclusive",
|
|
|
|
|
"gpu-required",
|
|
|
|
|
],
|
|
|
|
|
deps = [
|
|
|
|
|
":torch",
|
|
|
|
|
"@com_google_googletest//:gtest_main",
|
|
|
|
|
],
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
# jit tests
|
|
|
|
|
# Because these individual unit tests require custom registration,
|
|
|
|
|
# it is easier to mimic the cmake build by globbing together a single test.
|
|
|
|
|
cc_test(
|
|
|
|
|
name = "jit_tests",
|
|
|
|
|
size = "small",
|
2023-03-26 10:27:18 +00:00
|
|
|
srcs = glob(
|
|
|
|
|
[
|
|
|
|
|
"test/cpp/jit/*.cpp",
|
|
|
|
|
"test/cpp/jit/*.h",
|
|
|
|
|
"test/cpp/tensorexpr/*.cpp",
|
|
|
|
|
"test/cpp/tensorexpr/*.h",
|
|
|
|
|
],
|
|
|
|
|
exclude = [
|
|
|
|
|
# skip this since <pybind11/embed.h> is not found in OSS build
|
|
|
|
|
"test/cpp/jit/test_exception.cpp",
|
|
|
|
|
],
|
|
|
|
|
),
|
2020-04-07 05:48:33 +00:00
|
|
|
linkstatic = True,
|
|
|
|
|
tags = [
|
|
|
|
|
"exclusive",
|
|
|
|
|
"gpu-required",
|
|
|
|
|
],
|
|
|
|
|
deps = [
|
|
|
|
|
":torch",
|
|
|
|
|
"@com_google_googletest//:gtest_main",
|
|
|
|
|
],
|
|
|
|
|
)
|
|
|
|
|
|
2021-10-07 17:03:02 +00:00
|
|
|
cc_test(
|
|
|
|
|
name = "lazy_tests",
|
|
|
|
|
size = "small",
|
2023-03-26 10:27:18 +00:00
|
|
|
srcs = glob(
|
|
|
|
|
[
|
|
|
|
|
"test/cpp/lazy/*.cpp",
|
|
|
|
|
"test/cpp/lazy/*.h",
|
|
|
|
|
],
|
|
|
|
|
exclude = [
|
|
|
|
|
# skip these since they depend on generated LazyIr.h which isn't available in bazel yet
|
|
|
|
|
"test/cpp/lazy/test_ir.cpp",
|
|
|
|
|
"test/cpp/lazy/test_lazy_ops.cpp",
|
|
|
|
|
"test/cpp/lazy/test_lazy_ops_util.cpp",
|
|
|
|
|
],
|
|
|
|
|
),
|
2021-10-07 17:03:02 +00:00
|
|
|
linkstatic = True,
|
|
|
|
|
tags = [
|
|
|
|
|
"exclusive",
|
|
|
|
|
],
|
|
|
|
|
deps = [
|
|
|
|
|
":torch",
|
|
|
|
|
"@com_google_googletest//:gtest_main",
|
|
|
|
|
],
|
|
|
|
|
)
|
|
|
|
|
|
2023-05-12 19:43:56 +00:00
|
|
|
# python api tests
|
|
|
|
|
|
|
|
|
|
py_test(
|
|
|
|
|
name = "test_bazel",
|
2023-05-18 20:29:03 +00:00
|
|
|
srcs = ["test/_test_bazel.py"],
|
|
|
|
|
main = "test/_test_bazel.py",
|
2023-05-12 19:43:56 +00:00
|
|
|
deps = [":pytorch_py"],
|
|
|
|
|
)
|
|
|
|
|
|
2020-04-07 05:48:33 +00:00
|
|
|
# all tests
|
|
|
|
|
test_suite(
|
|
|
|
|
name = "all_tests",
|
|
|
|
|
tests = [
|
|
|
|
|
"api_tests",
|
|
|
|
|
"jit_tests",
|
|
|
|
|
"torch_dist_autograd_test",
|
2022-02-03 20:07:01 +00:00
|
|
|
"//c10/test:tests",
|
2020-04-07 05:48:33 +00:00
|
|
|
],
|
|
|
|
|
)
|
2022-05-03 09:48:27 +00:00
|
|
|
|
|
|
|
|
# An internal genrule that we are converging with refers to these files
|
|
|
|
|
# as if they are from this package, so we alias them for
|
|
|
|
|
# compatibility.
|
|
|
|
|
|
2022-05-05 23:31:35 +00:00
|
|
|
[
|
|
|
|
|
alias(
|
|
|
|
|
name = paths.basename(path),
|
|
|
|
|
actual = path,
|
|
|
|
|
)
|
|
|
|
|
for path in [
|
|
|
|
|
"aten/src/ATen/templates/DispatchKeyNativeFunctions.cpp",
|
|
|
|
|
"aten/src/ATen/templates/DispatchKeyNativeFunctions.h",
|
|
|
|
|
"aten/src/ATen/templates/LazyIr.h",
|
2022-05-24 19:29:23 +00:00
|
|
|
"aten/src/ATen/templates/LazyNonNativeIr.h",
|
2022-05-05 23:31:35 +00:00
|
|
|
"aten/src/ATen/templates/RegisterDispatchKey.cpp",
|
[torchgen] Generate wrapper functions under custom namespaces (#81744)
Summary:
A follow-up of #81581. Before these 2 PRs, if an operator with a custom kernel namespace is added to `native_functions.yaml` (or any other yaml consumed by `torchgen`), although we are able to recognize the custom kernel in files such as `NativeFunctions.h` and `RegisterCPU.cpp`, we still generate backend-specific wrappers under the hardcoded `at` namespace. This PR changes that behavior by generating wrapper functions under custom namespaces.
For example, if the entries in yaml file looks like:
```
- func: op_1(Tensor(a) self) -> Tensor(a)
dispatch:
CPU: at::op_1_kernel # ATen kernel
- func: op_2(Tensor(a) self) -> Tensor(a)
dispatch:
CPU: custom::op_2_kernel # custom kernel
```
We generate the following code for `CPUFunctions_inl.h` and `RegisterCPU.cpp`:
`CPUFunctions_inl.h`:
```
namespace at {
namespace cpu {
TORCH_API at::Tensor & op_1(const at::Tensor & self);
} // namespace cpu
} // namespace at
namespace custom {
namespace cpu {
TORCH_API at::Tensor & op_2(const at::Tensor & self);
} // namespace cpu
} // namespace custom
```
Notice the difference between `at::cpu` and `custom::cpu`.
Then the definition for these can be found in `RegisterCPU.cpp`.
`RegisterCPU.cpp`:
```
#include "CPUFunctions.h"
namespace at {
namespace {
at::Tensor & wrapper_op_1(const at::Tensor & self) {
// No device check
// DeviceGuard omitted
return at::native::op_1_kernel(self);
}
} // anonymous namespace
TORCH_LIBRARY_IMPL(aten, CPU, m) {
m.impl("op_1", TORCH_FN(wrapper_op_1));
}
namespace cpu {
at::Tensor & op_1(const at::Tensor & self) {
return wrapper_op_1(self);
}
} // namespace cpu
} // namespace at
namespace custom {
namespace {
at::Tensor & wrapper_op_2(const at::Tensor & self) {
// No device check
// DeviceGuard omitted
return at::native::op_2_kernel(self);
}
} // anonymous namespace
TORCH_LIBRARY_IMPL(aten, CPU, m) {
m.impl("op_2", TORCH_FN(wrapper_op_2));
}
namespace cpu {
at::Tensor & op_2(const at::Tensor & self) {
return wrapper_op_2(self);
}
} // namespace cpu
} // namespace custom
```
The benefit for this change is that it unifies all the namespaces derived from custom ops. In the example above, there are:
1. `custom::native` for kernels
2. `custom::<dispatch_key>` e.g., `custom::cpu` for wrappers
This customized operator will have nothing to do with `at::native`, `at::cpu` etc.
Test Plan: This is very hard to test. I will refactor this logic and abstract out some layers so it's testable. Will do it in coming PRs.
Differential Revision: D37972772
Pull Request resolved: https://github.com/pytorch/pytorch/pull/81744
Approved by: https://github.com/bdhirsh
2022-08-04 07:48:44 +00:00
|
|
|
"aten/src/ATen/templates/RegisterDispatchDefinitions.ini",
|
2022-05-05 23:31:35 +00:00
|
|
|
"aten/src/ATen/native/native_functions.yaml",
|
|
|
|
|
"aten/src/ATen/native/tags.yaml",
|
|
|
|
|
"aten/src/ATen/native/ts_native_functions.yaml",
|
|
|
|
|
"torch/csrc/lazy/core/shape_inference.h",
|
|
|
|
|
"torch/csrc/lazy/ts_backend/ts_native_functions.cpp",
|
|
|
|
|
]
|
|
|
|
|
]
|
2022-09-06 18:14:08 +00:00
|
|
|
|
|
|
|
|
genrule(
|
|
|
|
|
name = "download_mnist",
|
|
|
|
|
srcs = ["//:tools/download_mnist.py"],
|
|
|
|
|
outs = [
|
|
|
|
|
"mnist/train-images-idx3-ubyte",
|
|
|
|
|
"mnist/train-labels-idx1-ubyte",
|
|
|
|
|
"mnist/t10k-images-idx3-ubyte",
|
|
|
|
|
"mnist/t10k-labels-idx1-ubyte",
|
|
|
|
|
],
|
|
|
|
|
cmd = "python3 tools/download_mnist.py -d $(RULEDIR)/mnist",
|
|
|
|
|
)
|