Canonicalize all includes in PyTorch. (#14849)
Summary:
Anywhere we used #include "foo.h", we now say #include <foo.h>
Paths are adjusted to be rooted out of aten/src, torch/lib, or
the root level directory.
I modified CMakeLists.txt by hand to remove TH and THC from
the include paths.
I used the following script to do the canonicalization:
```
import subprocess
import re
import os.path
files = subprocess.check_output(['git', 'ls-files']).decode('utf-8').rstrip().split('\n')
for fn in files:
if not any(fn.endswith(suff) for suff in ['.cu', '.cpp', '.in', '.h', '.hpp', '.cu', '.cuh', '.cc']):
continue
if not any(fn.startswith(pref) for pref in ["aten/", "torch/"]):
continue
with open(fn, 'r') as f:
c = f.read()
def fmt(p):
return "#include <{}>".format(p)
def repl(m):
p = m.group(1)
if p in ["dlfcn.h", "unistd.h", "nvrtc.h", "cuda.h", "cuda_runtime.h", "cstdint", "cudnn.h", "Python.h", "cusparse.h", "cuda_runtime_api.h", "cuda_fp16.h", "cublas_v2.h", "stdint.h", "curand_kernel.h"]:
return fmt(p)
if any(p.startswith(pref) for pref in ["torch/csrc", "c10/", "ATen/", "caffe2/", "TH/", "THC/", "Eigen/", "gtest/", "zdl/", "gloo/", "onnx/", "miopen/"]):
return fmt(p)
for root in ["aten/src", "torch/lib", ""]:
for bad_root in [os.path.dirname(fn), "aten/src/TH", "aten/src/THC", "torch/csrc"]:
new_p = os.path.relpath(os.path.join(bad_root, p), root)
if not new_p.startswith("../") and (os.path.exists(os.path.join(root, new_p)) or os.path.exists(os.path.join(root, new_p + ".in"))):
return fmt(new_p)
print("ERROR: ", fn, p)
return m.group(0)
new_c = re.sub(r'#include "([^"]+)"', repl, c)
if new_c != c:
print(fn)
with open(fn, 'w') as f:
f.write(new_c)
```
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14849
Reviewed By: dzhulgakov
Differential Revision: D13363445
Pulled By: ezyang
fbshipit-source-id: 52361f878a672785f9306c9e9ab2513128092b68
2018-12-09 03:32:01 +00:00
|
|
|
#include <torch/csrc/tensor/python_tensor.h>
|
2018-02-23 23:03:31 +00:00
|
|
|
|
|
|
|
|
#include <structmember.h>
|
|
|
|
|
#include <pybind11/pybind11.h>
|
|
|
|
|
|
Canonicalize all includes in PyTorch. (#14849)
Summary:
Anywhere we used #include "foo.h", we now say #include <foo.h>
Paths are adjusted to be rooted out of aten/src, torch/lib, or
the root level directory.
I modified CMakeLists.txt by hand to remove TH and THC from
the include paths.
I used the following script to do the canonicalization:
```
import subprocess
import re
import os.path
files = subprocess.check_output(['git', 'ls-files']).decode('utf-8').rstrip().split('\n')
for fn in files:
if not any(fn.endswith(suff) for suff in ['.cu', '.cpp', '.in', '.h', '.hpp', '.cu', '.cuh', '.cc']):
continue
if not any(fn.startswith(pref) for pref in ["aten/", "torch/"]):
continue
with open(fn, 'r') as f:
c = f.read()
def fmt(p):
return "#include <{}>".format(p)
def repl(m):
p = m.group(1)
if p in ["dlfcn.h", "unistd.h", "nvrtc.h", "cuda.h", "cuda_runtime.h", "cstdint", "cudnn.h", "Python.h", "cusparse.h", "cuda_runtime_api.h", "cuda_fp16.h", "cublas_v2.h", "stdint.h", "curand_kernel.h"]:
return fmt(p)
if any(p.startswith(pref) for pref in ["torch/csrc", "c10/", "ATen/", "caffe2/", "TH/", "THC/", "Eigen/", "gtest/", "zdl/", "gloo/", "onnx/", "miopen/"]):
return fmt(p)
for root in ["aten/src", "torch/lib", ""]:
for bad_root in [os.path.dirname(fn), "aten/src/TH", "aten/src/THC", "torch/csrc"]:
new_p = os.path.relpath(os.path.join(bad_root, p), root)
if not new_p.startswith("../") and (os.path.exists(os.path.join(root, new_p)) or os.path.exists(os.path.join(root, new_p + ".in"))):
return fmt(new_p)
print("ERROR: ", fn, p)
return m.group(0)
new_c = re.sub(r'#include "([^"]+)"', repl, c)
if new_c != c:
print(fn)
with open(fn, 'w') as f:
f.write(new_c)
```
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14849
Reviewed By: dzhulgakov
Differential Revision: D13363445
Pulled By: ezyang
fbshipit-source-id: 52361f878a672785f9306c9e9ab2513128092b68
2018-12-09 03:32:01 +00:00
|
|
|
#include <torch/csrc/Dtype.h>
|
|
|
|
|
#include <torch/csrc/DynamicTypes.h>
|
|
|
|
|
#include <torch/csrc/Exceptions.h>
|
|
|
|
|
#include <torch/csrc/Layout.h>
|
|
|
|
|
#include <torch/csrc/autograd/variable.h>
|
|
|
|
|
#include <torch/csrc/autograd/python_variable.h>
|
|
|
|
|
#include <torch/csrc/autograd/generated/VariableType.h>
|
|
|
|
|
#include <torch/csrc/autograd/utils/wrap_outputs.h>
|
|
|
|
|
#include <torch/csrc/utils/cuda_enabled.h>
|
|
|
|
|
#include <torch/csrc/utils/cuda_lazy_init.h>
|
|
|
|
|
#include <torch/csrc/utils/python_strings.h>
|
|
|
|
|
#include <torch/csrc/utils/tensor_new.h>
|
|
|
|
|
#include <torch/csrc/utils/tensor_types.h>
|
2018-02-23 23:03:31 +00:00
|
|
|
|
2018-06-16 07:40:35 +00:00
|
|
|
#include <ATen/ATen.h>
|
|
|
|
|
|
2018-05-04 15:04:57 +00:00
|
|
|
#include <sstream>
|
2018-06-16 07:40:35 +00:00
|
|
|
#include <string>
|
|
|
|
|
#include <type_traits>
|
2018-05-04 15:04:57 +00:00
|
|
|
#include <vector>
|
|
|
|
|
|
2018-06-26 04:11:49 +00:00
|
|
|
namespace torch { namespace tensors {
|
2018-02-23 23:03:31 +00:00
|
|
|
|
|
|
|
|
using namespace at;
|
|
|
|
|
using namespace torch::autograd;
|
|
|
|
|
|
|
|
|
|
struct PyTensorType {
|
|
|
|
|
PyTypeObject py_type;
|
Split libATen.so into libATen_cpu.so and libATen_cuda.so (#7275)
* Split libATen.so into libATen_cpu.so and libATen_cuda.so
Previously, ATen could be built with either CPU-only support, or
CPU/CUDA support, but only via a compile-time flag, requiring
two separate builds. This means that if you have a program which
indirectly uses a CPU-only build of ATen, and a CPU/CUDA-build of
ATen, you're gonna have a bad time. And you might want a CPU-only
build of ATen, because it is 15M (versus the 300M of a CUDA build).
This commit splits libATen.so into two libraries, CPU/CUDA, so
that it's not necessary to do a full rebuild to get CPU-only
support; instead, if you link against libATen_cpu.so only, you
are CPU-only; if you additionally link/dlopen libATen_cuda.so,
this enables CUDA support. This brings ATen's dynamic library
structure more similar to Caffe2's. libATen.so is no more
(this is BC BREAKING)
The general principle for how this works is that we introduce
a *hooks* interface, which introduces a dynamic dispatch indirection
between a call site and implementation site of CUDA functionality,
mediated by a static initialization registry. This means that we can continue
to, for example, lazily initialize CUDA from Context (a core, CPU class) without
having a direct dependency on the CUDA bits. Instead, we look up
in the registry if, e.g., CUDA hooks have been loaded (this loading
process happens at static initialization time), and if they
have been we dynamic dispatch to this class. We similarly use
the hooks interface to handle Variable registration.
We introduce a new invariant: if the backend of a type has not
been initialized (e.g., it's library has not been dlopened; for
CUDA, this also includes CUDA initialization), then the Type
pointers in the context registry are NULL. If you access the
registry directly you must maintain this invariant.
There are a few potholes along the way. I document them here:
- Previously, PyTorch maintained a separate registry for variable
types, because no provision for them was made in the Context's
type_registry. Now that we have the hooks mechanism, we can easily
have PyTorch register variables in the main registry. The code
has been refactored accordingly.
- There is a subtle ordering issue between Variable and CUDA.
We permit libATen_cuda.so and PyTorch to be loaded in either
order (in practice, CUDA is always loaded "after" PyTorch, because
it is lazily initialized.) This means that, when CUDA types are
loaded, we must subsequently also initialize their Variable equivalents.
Appropriate hooks were added to VariableHooks to make this possible;
similarly, getVariableHooks() is not referentially transparent, and
will change behavior after Variables are loaded. (This is different
to CUDAHooks, which is "burned in" after you try to initialize CUDA.)
- The cmake is adjusted to separate dependencies into either CPU
or CUDA dependencies. The generator scripts are adjusted to either
generate a file as a CUDA (cuda_file_manager) or CPU file (file_manager).
- I changed all native functions which were CUDA-only (the cudnn functions)
to have dispatches for CUDA only (making it permissible to not specify
all dispatch options.) This uncovered a bug in how we were handling
native functions which dispatch on a Type argument; I introduced a new
self_ty keyword to handle this case. I'm not 100% happy about it
but it fixed my problem.
This also exposed the fact that set_history incompletely handles
heterogenous return tuples combining Tensor and TensorList. I
swapped this codegen to use flatten() (at the possible cost of
a slight perf regression, since we're allocating another vector now
in this code path).
- thc_state is no longer a public member of Context; use getTHCState() instead
- This PR comes with Registry from Caffe2, for handling static initialization.
I needed to make a bunch of fixes to Registry to make it more portable
- No more ##__VA_ARGS__ token pasting; instead, it is mandatory to pass at
least one argument to the var-args. CUDAHooks and VariableHooks pass a nullary
struct CUDAHooksArgs/VariableHooksArgs to solve the problem. We must get rid of
token pasting because it does not work with MSVC.
- It seems MSVC is not willing to generate code for constructors of template
classes at use sites which cross DLL boundaries. So we explicitly instantiate
the class to get around the problem. This involved tweaks to the boilerplate
generating macros, and also required us to shuffle around namespaces a bit,
because you can't specialize a template unless you are in the same namespace as
the template.
- Insertion of AT_API to appropriate places where the registry must be exported
- We have a general problem which is that on recent Ubuntu distributions,
--as-needed is enabled for shared libraries, which is (cc @apaszke who was
worrying about this in #7160 see also #7160 (comment)). For now, I've hacked
this up in the PR to pass -Wl,--no-as-needed to all of the spots necessary to
make CI work, but a more sustainable solution is to attempt to dlopen
libATen_cuda.so when CUDA functionality is requested.
- The JIT tests somehow manage to try to touch CUDA without loading libATen_cuda.so. So
we pass -Wl,--no-as-needed when linking libATen_cuda.so to _C.so
- There is a very subtle linking issue with lapack, which is solved by making sure libATen_cuda.so links against LAPACK. There's a comment in aten/src/ATen/CMakeLists.txt about htis as well as a follow up bug at #7353
- autogradpp used AT_CUDA_ENABLED directly. We've expunged these uses and added
a few more things to CUDAHooks (getNumGPUs)
- Added manualSeedAll to Generator so that we can invoke it polymorphically (it
only does something different for CUDAGenerator)
- There's a new cuda/CUDAConfig.h header for CUDA-only ifdef macros (AT_CUDNN_ENABLED, most prominently)
- CUDAHooks/VariableHooks structs live in at namespace because Registry's
namespace support is not good enough to handle it otherwise (see Registry
changes above)
- There's some modest moving around of native functions in ReduceOps and
UnaryOps to get the CUDA-only function implementations into separate files, so
they are only compiled into libATen_cuda.so. sspaddmm needed a separate CUDA
function due to object linkage boundaries.
- Some direct uses of native functions in CUDA code has to go away, since these
functions are not exported, so you have to go through the dispatcher
(at::native::empty_like to at::empty_like)
- Code in THC/THCS/THCUNN now properly use THC_API macro instead of TH_API
(which matters now that TH and THC are not in the same library)
- Added code debt in torch/_thnn/utils.py and other THNN parsing code to handle
both TH_API and THC_API
- TensorUtils.h is now properly exported with AT_API
- Dead uses of TH_EXPORTS and co expunged; we now use ATen_cpu_exports and
ATen_cuda_exports (new, in ATenCUDAGeneral.h) consistently
- Fix some incorrect type annotations on _cudnn_rnn_backward, where we didn't
declare a type as possibly undefined when we should have. We didn't catch this
previously because optional annotations are not tested on "pass-through" native
ATen ops (which don't have dispatch). Upstream issue at #7316
- There's a new cmake macro aten_compile_options for applying all of our
per-target compile time options. We use this on the cpu and cuda libraries.
- test/test_cpp_extensions.py can be run directly by invoking in Python,
assuming you've setup your PYTHONPATH setup correctly
- type_from_string does some new funny business to only query for all valid CUDA
types (which causes CUDA initialization) when we see "torch.cuda." in the
requested string
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
* Last mile libtorch fixes
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
* pedantic fix
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2018-05-10 17:28:33 +00:00
|
|
|
at::Type* aten_type_;
|
Introduce torch.layout and split layout from dtypes. (#6145)
* Introduce torch.layout and split layout from dtypes.
Tensors (and tensor types) now have a 'layout' attribute that returns either 'torch.strided' or 'torch.sparse_coo'.
Previously, dtypes were 1-to-1 with ATen types/PyTensorTypes; the impetus behind this decision was to make things easy in the common case
(i.e. specifying a type in a factory function). But this doesn't really follow for sparity, which isn't a common case.
It also doesn't properly represent the concept or a dtype, which in numpy are proper scalar types (i.e. roughly the type returned from indexing the
last dimension of an n-d array). But this should be the same whether or not the tensor is represented via strides, sparsity, etc.
This is accomplished by:
1) having the dtype of tensor return the (device-type, scalar-type) combination, i.e. torch.cuda.float32, so both
torch.cuda.FloatTensor and torch.cuda.sparse.FloatTensor have the same dtype
2) Adding a layout parameter to python functions, where the combination of (dtype, layout) maps to an ATen type that is used for dispatch.
* Formatting, make init throw python_error.
* Fix cuda not enabled error message.
* Fix test.
2018-04-02 18:07:50 +00:00
|
|
|
THPDtype* dtype;
|
|
|
|
|
THPLayout* layout;
|
2018-04-12 18:05:44 +00:00
|
|
|
bool is_cuda;
|
2018-02-23 23:03:31 +00:00
|
|
|
char name[64];
|
Split libATen.so into libATen_cpu.so and libATen_cuda.so (#7275)
* Split libATen.so into libATen_cpu.so and libATen_cuda.so
Previously, ATen could be built with either CPU-only support, or
CPU/CUDA support, but only via a compile-time flag, requiring
two separate builds. This means that if you have a program which
indirectly uses a CPU-only build of ATen, and a CPU/CUDA-build of
ATen, you're gonna have a bad time. And you might want a CPU-only
build of ATen, because it is 15M (versus the 300M of a CUDA build).
This commit splits libATen.so into two libraries, CPU/CUDA, so
that it's not necessary to do a full rebuild to get CPU-only
support; instead, if you link against libATen_cpu.so only, you
are CPU-only; if you additionally link/dlopen libATen_cuda.so,
this enables CUDA support. This brings ATen's dynamic library
structure more similar to Caffe2's. libATen.so is no more
(this is BC BREAKING)
The general principle for how this works is that we introduce
a *hooks* interface, which introduces a dynamic dispatch indirection
between a call site and implementation site of CUDA functionality,
mediated by a static initialization registry. This means that we can continue
to, for example, lazily initialize CUDA from Context (a core, CPU class) without
having a direct dependency on the CUDA bits. Instead, we look up
in the registry if, e.g., CUDA hooks have been loaded (this loading
process happens at static initialization time), and if they
have been we dynamic dispatch to this class. We similarly use
the hooks interface to handle Variable registration.
We introduce a new invariant: if the backend of a type has not
been initialized (e.g., it's library has not been dlopened; for
CUDA, this also includes CUDA initialization), then the Type
pointers in the context registry are NULL. If you access the
registry directly you must maintain this invariant.
There are a few potholes along the way. I document them here:
- Previously, PyTorch maintained a separate registry for variable
types, because no provision for them was made in the Context's
type_registry. Now that we have the hooks mechanism, we can easily
have PyTorch register variables in the main registry. The code
has been refactored accordingly.
- There is a subtle ordering issue between Variable and CUDA.
We permit libATen_cuda.so and PyTorch to be loaded in either
order (in practice, CUDA is always loaded "after" PyTorch, because
it is lazily initialized.) This means that, when CUDA types are
loaded, we must subsequently also initialize their Variable equivalents.
Appropriate hooks were added to VariableHooks to make this possible;
similarly, getVariableHooks() is not referentially transparent, and
will change behavior after Variables are loaded. (This is different
to CUDAHooks, which is "burned in" after you try to initialize CUDA.)
- The cmake is adjusted to separate dependencies into either CPU
or CUDA dependencies. The generator scripts are adjusted to either
generate a file as a CUDA (cuda_file_manager) or CPU file (file_manager).
- I changed all native functions which were CUDA-only (the cudnn functions)
to have dispatches for CUDA only (making it permissible to not specify
all dispatch options.) This uncovered a bug in how we were handling
native functions which dispatch on a Type argument; I introduced a new
self_ty keyword to handle this case. I'm not 100% happy about it
but it fixed my problem.
This also exposed the fact that set_history incompletely handles
heterogenous return tuples combining Tensor and TensorList. I
swapped this codegen to use flatten() (at the possible cost of
a slight perf regression, since we're allocating another vector now
in this code path).
- thc_state is no longer a public member of Context; use getTHCState() instead
- This PR comes with Registry from Caffe2, for handling static initialization.
I needed to make a bunch of fixes to Registry to make it more portable
- No more ##__VA_ARGS__ token pasting; instead, it is mandatory to pass at
least one argument to the var-args. CUDAHooks and VariableHooks pass a nullary
struct CUDAHooksArgs/VariableHooksArgs to solve the problem. We must get rid of
token pasting because it does not work with MSVC.
- It seems MSVC is not willing to generate code for constructors of template
classes at use sites which cross DLL boundaries. So we explicitly instantiate
the class to get around the problem. This involved tweaks to the boilerplate
generating macros, and also required us to shuffle around namespaces a bit,
because you can't specialize a template unless you are in the same namespace as
the template.
- Insertion of AT_API to appropriate places where the registry must be exported
- We have a general problem which is that on recent Ubuntu distributions,
--as-needed is enabled for shared libraries, which is (cc @apaszke who was
worrying about this in #7160 see also #7160 (comment)). For now, I've hacked
this up in the PR to pass -Wl,--no-as-needed to all of the spots necessary to
make CI work, but a more sustainable solution is to attempt to dlopen
libATen_cuda.so when CUDA functionality is requested.
- The JIT tests somehow manage to try to touch CUDA without loading libATen_cuda.so. So
we pass -Wl,--no-as-needed when linking libATen_cuda.so to _C.so
- There is a very subtle linking issue with lapack, which is solved by making sure libATen_cuda.so links against LAPACK. There's a comment in aten/src/ATen/CMakeLists.txt about htis as well as a follow up bug at #7353
- autogradpp used AT_CUDA_ENABLED directly. We've expunged these uses and added
a few more things to CUDAHooks (getNumGPUs)
- Added manualSeedAll to Generator so that we can invoke it polymorphically (it
only does something different for CUDAGenerator)
- There's a new cuda/CUDAConfig.h header for CUDA-only ifdef macros (AT_CUDNN_ENABLED, most prominently)
- CUDAHooks/VariableHooks structs live in at namespace because Registry's
namespace support is not good enough to handle it otherwise (see Registry
changes above)
- There's some modest moving around of native functions in ReduceOps and
UnaryOps to get the CUDA-only function implementations into separate files, so
they are only compiled into libATen_cuda.so. sspaddmm needed a separate CUDA
function due to object linkage boundaries.
- Some direct uses of native functions in CUDA code has to go away, since these
functions are not exported, so you have to go through the dispatcher
(at::native::empty_like to at::empty_like)
- Code in THC/THCS/THCUNN now properly use THC_API macro instead of TH_API
(which matters now that TH and THC are not in the same library)
- Added code debt in torch/_thnn/utils.py and other THNN parsing code to handle
both TH_API and THC_API
- TensorUtils.h is now properly exported with AT_API
- Dead uses of TH_EXPORTS and co expunged; we now use ATen_cpu_exports and
ATen_cuda_exports (new, in ATenCUDAGeneral.h) consistently
- Fix some incorrect type annotations on _cudnn_rnn_backward, where we didn't
declare a type as possibly undefined when we should have. We didn't catch this
previously because optional annotations are not tested on "pass-through" native
ATen ops (which don't have dispatch). Upstream issue at #7316
- There's a new cmake macro aten_compile_options for applying all of our
per-target compile time options. We use this on the cpu and cuda libraries.
- test/test_cpp_extensions.py can be run directly by invoking in Python,
assuming you've setup your PYTHONPATH setup correctly
- type_from_string does some new funny business to only query for all valid CUDA
types (which causes CUDA initialization) when we see "torch.cuda." in the
requested string
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
* Last mile libtorch fixes
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
* pedantic fix
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2018-05-10 17:28:33 +00:00
|
|
|
int backend;
|
|
|
|
|
int scalar_type;
|
|
|
|
|
|
|
|
|
|
// Precondition: Access to this struct is protected by the GIL
|
|
|
|
|
at::Type* aten_type() {
|
|
|
|
|
if (!aten_type_) {
|
2018-09-14 15:55:39 +00:00
|
|
|
if (is_cuda) {
|
|
|
|
|
torch::utils::cuda_lazy_init();
|
|
|
|
|
}
|
2018-08-31 03:03:02 +00:00
|
|
|
auto* baseType = globalContext().getNonVariableTypeOpt(static_cast<at::Backend>(backend), static_cast<at::ScalarType>(scalar_type));
|
2018-08-31 03:03:14 +00:00
|
|
|
aten_type_ = baseType ? torch::autograd::VariableType::getVariableTypeFromBaseType(*baseType) : nullptr;
|
Split libATen.so into libATen_cpu.so and libATen_cuda.so (#7275)
* Split libATen.so into libATen_cpu.so and libATen_cuda.so
Previously, ATen could be built with either CPU-only support, or
CPU/CUDA support, but only via a compile-time flag, requiring
two separate builds. This means that if you have a program which
indirectly uses a CPU-only build of ATen, and a CPU/CUDA-build of
ATen, you're gonna have a bad time. And you might want a CPU-only
build of ATen, because it is 15M (versus the 300M of a CUDA build).
This commit splits libATen.so into two libraries, CPU/CUDA, so
that it's not necessary to do a full rebuild to get CPU-only
support; instead, if you link against libATen_cpu.so only, you
are CPU-only; if you additionally link/dlopen libATen_cuda.so,
this enables CUDA support. This brings ATen's dynamic library
structure more similar to Caffe2's. libATen.so is no more
(this is BC BREAKING)
The general principle for how this works is that we introduce
a *hooks* interface, which introduces a dynamic dispatch indirection
between a call site and implementation site of CUDA functionality,
mediated by a static initialization registry. This means that we can continue
to, for example, lazily initialize CUDA from Context (a core, CPU class) without
having a direct dependency on the CUDA bits. Instead, we look up
in the registry if, e.g., CUDA hooks have been loaded (this loading
process happens at static initialization time), and if they
have been we dynamic dispatch to this class. We similarly use
the hooks interface to handle Variable registration.
We introduce a new invariant: if the backend of a type has not
been initialized (e.g., it's library has not been dlopened; for
CUDA, this also includes CUDA initialization), then the Type
pointers in the context registry are NULL. If you access the
registry directly you must maintain this invariant.
There are a few potholes along the way. I document them here:
- Previously, PyTorch maintained a separate registry for variable
types, because no provision for them was made in the Context's
type_registry. Now that we have the hooks mechanism, we can easily
have PyTorch register variables in the main registry. The code
has been refactored accordingly.
- There is a subtle ordering issue between Variable and CUDA.
We permit libATen_cuda.so and PyTorch to be loaded in either
order (in practice, CUDA is always loaded "after" PyTorch, because
it is lazily initialized.) This means that, when CUDA types are
loaded, we must subsequently also initialize their Variable equivalents.
Appropriate hooks were added to VariableHooks to make this possible;
similarly, getVariableHooks() is not referentially transparent, and
will change behavior after Variables are loaded. (This is different
to CUDAHooks, which is "burned in" after you try to initialize CUDA.)
- The cmake is adjusted to separate dependencies into either CPU
or CUDA dependencies. The generator scripts are adjusted to either
generate a file as a CUDA (cuda_file_manager) or CPU file (file_manager).
- I changed all native functions which were CUDA-only (the cudnn functions)
to have dispatches for CUDA only (making it permissible to not specify
all dispatch options.) This uncovered a bug in how we were handling
native functions which dispatch on a Type argument; I introduced a new
self_ty keyword to handle this case. I'm not 100% happy about it
but it fixed my problem.
This also exposed the fact that set_history incompletely handles
heterogenous return tuples combining Tensor and TensorList. I
swapped this codegen to use flatten() (at the possible cost of
a slight perf regression, since we're allocating another vector now
in this code path).
- thc_state is no longer a public member of Context; use getTHCState() instead
- This PR comes with Registry from Caffe2, for handling static initialization.
I needed to make a bunch of fixes to Registry to make it more portable
- No more ##__VA_ARGS__ token pasting; instead, it is mandatory to pass at
least one argument to the var-args. CUDAHooks and VariableHooks pass a nullary
struct CUDAHooksArgs/VariableHooksArgs to solve the problem. We must get rid of
token pasting because it does not work with MSVC.
- It seems MSVC is not willing to generate code for constructors of template
classes at use sites which cross DLL boundaries. So we explicitly instantiate
the class to get around the problem. This involved tweaks to the boilerplate
generating macros, and also required us to shuffle around namespaces a bit,
because you can't specialize a template unless you are in the same namespace as
the template.
- Insertion of AT_API to appropriate places where the registry must be exported
- We have a general problem which is that on recent Ubuntu distributions,
--as-needed is enabled for shared libraries, which is (cc @apaszke who was
worrying about this in #7160 see also #7160 (comment)). For now, I've hacked
this up in the PR to pass -Wl,--no-as-needed to all of the spots necessary to
make CI work, but a more sustainable solution is to attempt to dlopen
libATen_cuda.so when CUDA functionality is requested.
- The JIT tests somehow manage to try to touch CUDA without loading libATen_cuda.so. So
we pass -Wl,--no-as-needed when linking libATen_cuda.so to _C.so
- There is a very subtle linking issue with lapack, which is solved by making sure libATen_cuda.so links against LAPACK. There's a comment in aten/src/ATen/CMakeLists.txt about htis as well as a follow up bug at #7353
- autogradpp used AT_CUDA_ENABLED directly. We've expunged these uses and added
a few more things to CUDAHooks (getNumGPUs)
- Added manualSeedAll to Generator so that we can invoke it polymorphically (it
only does something different for CUDAGenerator)
- There's a new cuda/CUDAConfig.h header for CUDA-only ifdef macros (AT_CUDNN_ENABLED, most prominently)
- CUDAHooks/VariableHooks structs live in at namespace because Registry's
namespace support is not good enough to handle it otherwise (see Registry
changes above)
- There's some modest moving around of native functions in ReduceOps and
UnaryOps to get the CUDA-only function implementations into separate files, so
they are only compiled into libATen_cuda.so. sspaddmm needed a separate CUDA
function due to object linkage boundaries.
- Some direct uses of native functions in CUDA code has to go away, since these
functions are not exported, so you have to go through the dispatcher
(at::native::empty_like to at::empty_like)
- Code in THC/THCS/THCUNN now properly use THC_API macro instead of TH_API
(which matters now that TH and THC are not in the same library)
- Added code debt in torch/_thnn/utils.py and other THNN parsing code to handle
both TH_API and THC_API
- TensorUtils.h is now properly exported with AT_API
- Dead uses of TH_EXPORTS and co expunged; we now use ATen_cpu_exports and
ATen_cuda_exports (new, in ATenCUDAGeneral.h) consistently
- Fix some incorrect type annotations on _cudnn_rnn_backward, where we didn't
declare a type as possibly undefined when we should have. We didn't catch this
previously because optional annotations are not tested on "pass-through" native
ATen ops (which don't have dispatch). Upstream issue at #7316
- There's a new cmake macro aten_compile_options for applying all of our
per-target compile time options. We use this on the cpu and cuda libraries.
- test/test_cpp_extensions.py can be run directly by invoking in Python,
assuming you've setup your PYTHONPATH setup correctly
- type_from_string does some new funny business to only query for all valid CUDA
types (which causes CUDA initialization) when we see "torch.cuda." in the
requested string
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
* Last mile libtorch fixes
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
* pedantic fix
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2018-05-10 17:28:33 +00:00
|
|
|
}
|
|
|
|
|
return aten_type_;
|
|
|
|
|
}
|
2018-02-23 23:03:31 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
// PyTensorType begins with a PyTypeObject member and this file casts a
// PyTypeObject* to a PyTensorType*; standard layout guarantees that a pointer
// to the struct and a pointer to its first member are interconvertible.
static_assert(std::is_standard_layout<PyTensorType>::value, "PyTensorType must be standard layout");
|
|
|
|
|
|
2018-04-03 20:29:25 +00:00
|
|
|
// This is always an instance of VariableType
|
|
|
|
|
// File-local pointer to the default tensor type. As a namespace-scope static
// it is zero-initialized (nullptr) until it is assigned; the assignment is not
// visible in this part of the file.
static at::Type* default_tensor_type;
|
2018-02-23 23:03:31 +00:00
|
|
|
|
|
|
|
|
// Forward declaration of a file-local helper that takes the full list of
// tensor types by const reference. The definition is not visible in this part
// of the file.
static void py_bind_tensor_types(const std::vector<PyTensorType>& tensor_types);
|
|
|
|
|
|
|
|
|
|
// Builds the TypeError thrown when a Python tensor type has no ATen type
// behind it (see Tensor_new, which throws the result when aten_type() is null).
//
// The message always names the offending type. The "not compiled with CUDA"
// hint is appended only when it can actually be the cause: the requested type
// is a CUDA type and this build reports CUDA as disabled. The previous
// condition was inverted and emitted the hint exactly when CUDA *was* enabled.
static TypeError unavailable_type(const PyTensorType& type) {
  const bool missing_cuda = type.is_cuda && !torch::utils::cuda_enabled();
  const char* cuda_msg = missing_cuda ? ". Torch not compiled with CUDA enabled." : "";
  return TypeError("type %s not available%s", type.name, cuda_msg);
}
|
|
|
|
|
|
|
|
|
|
static PyObject* Tensor_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) {
|
|
|
|
|
HANDLE_TH_ERRORS
|
|
|
|
|
auto& tensor_type = *((PyTensorType*)type);
|
Split libATen.so into libATen_cpu.so and libATen_cuda.so (#7275)
* Split libATen.so into libATen_cpu.so and libATen_cuda.so
Previously, ATen could be built with either CPU-only support, or
CPU/CUDA support, but only via a compile-time flag, requiring
two separate builds. This means that if you have a program which
indirectly uses a CPU-only build of ATen, and a CPU/CUDA-build of
ATen, you're gonna have a bad time. And you might want a CPU-only
build of ATen, because it is 15M (versus the 300M of a CUDA build).
This commit splits libATen.so into two libraries, CPU/CUDA, so
that it's not necessary to do a full rebuild to get CPU-only
support; instead, if you link against libATen_cpu.so only, you
are CPU-only; if you additionally link/dlopen libATen_cuda.so,
this enables CUDA support. This brings ATen's dynamic library
structure more similar to Caffe2's. libATen.so is no more
(this is BC BREAKING)
The general principle for how this works is that we introduce
a *hooks* interface, which introduces a dynamic dispatch indirection
between a call site and implementation site of CUDA functionality,
mediated by a static initialization registry. This means that we can continue
to, for example, lazily initialize CUDA from Context (a core, CPU class) without
having a direct dependency on the CUDA bits. Instead, we look up
in the registry if, e.g., CUDA hooks have been loaded (this loading
process happens at static initialization time), and if they
have been we dynamic dispatch to this class. We similarly use
the hooks interface to handle Variable registration.
We introduce a new invariant: if the backend of a type has not
been initialized (e.g., it's library has not been dlopened; for
CUDA, this also includes CUDA initialization), then the Type
pointers in the context registry are NULL. If you access the
registry directly you must maintain this invariant.
There are a few potholes along the way. I document them here:
- Previously, PyTorch maintained a separate registry for variable
types, because no provision for them was made in the Context's
type_registry. Now that we have the hooks mechanism, we can easily
have PyTorch register variables in the main registry. The code
has been refactored accordingly.
- There is a subtle ordering issue between Variable and CUDA.
We permit libATen_cuda.so and PyTorch to be loaded in either
order (in practice, CUDA is always loaded "after" PyTorch, because
it is lazily initialized.) This means that, when CUDA types are
loaded, we must subsequently also initialize their Variable equivalents.
Appropriate hooks were added to VariableHooks to make this possible;
similarly, getVariableHooks() is not referentially transparent, and
will change behavior after Variables are loaded. (This is different
to CUDAHooks, which is "burned in" after you try to initialize CUDA.)
- The cmake is adjusted to separate dependencies into either CPU
or CUDA dependencies. The generator scripts are adjusted to either
generate a file as a CUDA (cuda_file_manager) or CPU file (file_manager).
- I changed all native functions which were CUDA-only (the cudnn functions)
to have dispatches for CUDA only (making it permissible to not specify
all dispatch options.) This uncovered a bug in how we were handling
native functions which dispatch on a Type argument; I introduced a new
self_ty keyword to handle this case. I'm not 100% happy about it
but it fixed my problem.
This also exposed the fact that set_history incompletely handles
heterogenous return tuples combining Tensor and TensorList. I
swapped this codegen to use flatten() (at the possible cost of
a slight perf regression, since we're allocating another vector now
in this code path).
- thc_state is no longer a public member of Context; use getTHCState() instead
- This PR comes with Registry from Caffe2, for handling static initialization.
I needed to make a bunch of fixes to Registry to make it more portable
- No more ##__VA_ARGS__ token pasting; instead, it is mandatory to pass at
least one argument to the var-args. CUDAHooks and VariableHooks pass a nullary
struct CUDAHooksArgs/VariableHooksArgs to solve the problem. We must get rid of
token pasting because it does not work with MSVC.
- It seems MSVC is not willing to generate code for constructors of template
classes at use sites which cross DLL boundaries. So we explicitly instantiate
the class to get around the problem. This involved tweaks to the boilerplate
generating macros, and also required us to shuffle around namespaces a bit,
because you can't specialize a template unless you are in the same namespace as
the template.
- Insertion of AT_API to appropriate places where the registry must be exported
- We have a general problem which is that on recent Ubuntu distributions,
--as-needed is enabled for shared libraries, which is (cc @apaszke who was
worrying about this in #7160 see also #7160 (comment)). For now, I've hacked
this up in the PR to pass -Wl,--no-as-needed to all of the spots necessary to
make CI work, but a more sustainable solution is to attempt to dlopen
libATen_cuda.so when CUDA functionality is requested.
- The JIT tests somehow manage to try to touch CUDA without loading libATen_cuda.so. So
we pass -Wl,--no-as-needed when linking libATen_cuda.so to _C.so
- There is a very subtle linking issue with lapack, which is solved by making sure libATen_cuda.so links against LAPACK. There's a comment in aten/src/ATen/CMakeLists.txt about htis as well as a follow up bug at #7353
- autogradpp used AT_CUDA_ENABLED directly. We've expunged these uses and added
a few more things to CUDAHooks (getNumGPUs)
- Added manualSeedAll to Generator so that we can invoke it polymorphically (it
only does something different for CUDAGenerator)
- There's a new cuda/CUDAConfig.h header for CUDA-only ifdef macros (AT_CUDNN_ENABLED, most prominently)
- CUDAHooks/VariableHooks structs live in at namespace because Registry's
namespace support is not good enough to handle it otherwise (see Registry
changes above)
- There's some modest moving around of native functions in ReduceOps and
UnaryOps to get the CUDA-only function implementations into separate files, so
they are only compiled into libATen_cuda.so. sspaddmm needed a separate CUDA
function due to object linkage boundaries.
- Some direct uses of native functions in CUDA code has to go away, since these
functions are not exported, so you have to go through the dispatcher
(at::native::empty_like to at::empty_like)
- Code in THC/THCS/THCUNN now properly use THC_API macro instead of TH_API
(which matters now that TH and THC are not in the same library)
- Added code debt in torch/_thnn/utils.py and other THNN parsing code to handle
both TH_API and THC_API
- TensorUtils.h is now properly exported with AT_API
- Dead uses of TH_EXPORTS and co expunged; we now use ATen_cpu_exports and
ATen_cuda_exports (new, in ATenCUDAGeneral.h) consistently
- Fix some incorrect type annotations on _cudnn_rnn_backward, where we didn't
declare a type as possibly undefined when we should have. We didn't catch this
previously because optional annotations are not tested on "pass-through" native
ATen ops (which don't have dispatch). Upstream issue at #7316
- There's a new cmake macro aten_compile_options for applying all of our
per-target compile time options. We use this on the cpu and cuda libraries.
- test/test_cpp_extensions.py can be run directly by invoking in Python,
assuming you've setup your PYTHONPATH setup correctly
- type_from_string does some new funny business to only query for all valid CUDA
types (which causes CUDA initialization) when we see "torch.cuda." in the
requested string
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
* Last mile libtorch fixes
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
* pedantic fix
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2018-05-10 17:28:33 +00:00
|
|
|
auto aten_type = tensor_type.aten_type();
|
|
|
|
|
if (!aten_type) {
|
2018-02-23 23:03:31 +00:00
|
|
|
throw unavailable_type(tensor_type);
|
|
|
|
|
}
|
2019-04-04 09:21:09 +00:00
|
|
|
auto scalar_type = static_cast<ScalarType>(tensor_type.scalar_type);
|
|
|
|
|
return THPVariable_Wrap(torch::utils::legacy_tensor_ctor(*aten_type, scalar_type, args, kwargs));
|
2018-02-23 23:03:31 +00:00
|
|
|
END_HANDLE_TH_ERRORS
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static PyObject* Tensor_instancecheck(PyTensorType* self, PyObject* arg) {
|
|
|
|
|
HANDLE_TH_ERRORS
|
|
|
|
|
if (THPVariable_Check(arg)) {
|
|
|
|
|
auto& var = ((THPVariable*)arg)->cdata;
|
Split libATen.so into libATen_cpu.so and libATen_cuda.so (#7275)
* Split libATen.so into libATen_cpu.so and libATen_cuda.so
Previously, ATen could be built with either CPU-only support, or
CPU/CUDA support, but only via a compile-time flag, requiring
two separate builds. This means that if you have a program which
indirectly uses a CPU-only build of ATen, and a CPU/CUDA-build of
ATen, you're gonna have a bad time. And you might want a CPU-only
build of ATen, because it is 15M (versus the 300M of a CUDA build).
This commit splits libATen.so into two libraries, CPU/CUDA, so
that it's not necessary to do a full rebuild to get CPU-only
support; instead, if you link against libATen_cpu.so only, you
are CPU-only; if you additionally link/dlopen libATen_cuda.so,
this enables CUDA support. This brings ATen's dynamic library
structure more similar to Caffe2's. libATen.so is no more
(this is BC BREAKING)
The general principle for how this works is that we introduce
a *hooks* interface, which introduces a dynamic dispatch indirection
between a call site and implementation site of CUDA functionality,
mediated by a static initialization registry. This means that we can continue
to, for example, lazily initialize CUDA from Context (a core, CPU class) without
having a direct dependency on the CUDA bits. Instead, we look up
in the registry if, e.g., CUDA hooks have been loaded (this loading
process happens at static initialization time), and if they
have been we dynamic dispatch to this class. We similarly use
the hooks interface to handle Variable registration.
We introduce a new invariant: if the backend of a type has not
been initialized (e.g., it's library has not been dlopened; for
CUDA, this also includes CUDA initialization), then the Type
pointers in the context registry are NULL. If you access the
registry directly you must maintain this invariant.
There are a few potholes along the way. I document them here:
- Previously, PyTorch maintained a separate registry for variable
types, because no provision for them was made in the Context's
type_registry. Now that we have the hooks mechanism, we can easily
have PyTorch register variables in the main registry. The code
has been refactored accordingly.
- There is a subtle ordering issue between Variable and CUDA.
We permit libATen_cuda.so and PyTorch to be loaded in either
order (in practice, CUDA is always loaded "after" PyTorch, because
it is lazily initialized.) This means that, when CUDA types are
loaded, we must subsequently also initialize their Variable equivalents.
Appropriate hooks were added to VariableHooks to make this possible;
similarly, getVariableHooks() is not referentially transparent, and
will change behavior after Variables are loaded. (This is different
to CUDAHooks, which is "burned in" after you try to initialize CUDA.)
- The cmake is adjusted to separate dependencies into either CPU
or CUDA dependencies. The generator scripts are adjusted to either
generate a file as a CUDA (cuda_file_manager) or CPU file (file_manager).
- I changed all native functions which were CUDA-only (the cudnn functions)
to have dispatches for CUDA only (making it permissible to not specify
all dispatch options.) This uncovered a bug in how we were handling
native functions which dispatch on a Type argument; I introduced a new
self_ty keyword to handle this case. I'm not 100% happy about it
but it fixed my problem.
This also exposed the fact that set_history incompletely handles
heterogenous return tuples combining Tensor and TensorList. I
swapped this codegen to use flatten() (at the possible cost of
a slight perf regression, since we're allocating another vector now
in this code path).
- thc_state is no longer a public member of Context; use getTHCState() instead
- This PR comes with Registry from Caffe2, for handling static initialization.
I needed to make a bunch of fixes to Registry to make it more portable
- No more ##__VA_ARGS__ token pasting; instead, it is mandatory to pass at
least one argument to the var-args. CUDAHooks and VariableHooks pass a nullary
struct CUDAHooksArgs/VariableHooksArgs to solve the problem. We must get rid of
token pasting because it does not work with MSVC.
- It seems MSVC is not willing to generate code for constructors of template
classes at use sites which cross DLL boundaries. So we explicitly instantiate
the class to get around the problem. This involved tweaks to the boilerplate
generating macros, and also required us to shuffle around namespaces a bit,
because you can't specialize a template unless you are in the same namespace as
the template.
- Insertion of AT_API to appropriate places where the registry must be exported
- We have a general problem which is that on recent Ubuntu distributions,
--as-needed is enabled for shared libraries, which is (cc @apaszke who was
worrying about this in #7160 see also #7160 (comment)). For now, I've hacked
this up in the PR to pass -Wl,--no-as-needed to all of the spots necessary to
make CI work, but a more sustainable solution is to attempt to dlopen
libATen_cuda.so when CUDA functionality is requested.
- The JIT tests somehow manage to try to touch CUDA without loading libATen_cuda.so. So
we pass -Wl,--no-as-needed when linking libATen_cuda.so to _C.so
- There is a very subtle linking issue with lapack, which is solved by making sure libATen_cuda.so links against LAPACK. There's a comment in aten/src/ATen/CMakeLists.txt about htis as well as a follow up bug at #7353
- autogradpp used AT_CUDA_ENABLED directly. We've expunged these uses and added
a few more things to CUDAHooks (getNumGPUs)
- Added manualSeedAll to Generator so that we can invoke it polymorphically (it
only does something different for CUDAGenerator)
- There's a new cuda/CUDAConfig.h header for CUDA-only ifdef macros (AT_CUDNN_ENABLED, most prominently)
- CUDAHooks/VariableHooks structs live in at namespace because Registry's
namespace support is not good enough to handle it otherwise (see Registry
changes above)
- There's some modest moving around of native functions in ReduceOps and
UnaryOps to get the CUDA-only function implementations into separate files, so
they are only compiled into libATen_cuda.so. sspaddmm needed a separate CUDA
function due to object linkage boundaries.
- Some direct uses of native functions in CUDA code has to go away, since these
functions are not exported, so you have to go through the dispatcher
(at::native::empty_like to at::empty_like)
- Code in THC/THCS/THCUNN now properly use THC_API macro instead of TH_API
(which matters now that TH and THC are not in the same library)
- Added code debt in torch/_thnn/utils.py and other THNN parsing code to handle
both TH_API and THC_API
- TensorUtils.h is now properly exported with AT_API
- Dead uses of TH_EXPORTS and co expunged; we now use ATen_cpu_exports and
ATen_cuda_exports (new, in ATenCUDAGeneral.h) consistently
- Fix some incorrect type annotations on _cudnn_rnn_backward, where we didn't
declare a type as possibly undefined when we should have. We didn't catch this
previously because optional annotations are not tested on "pass-through" native
ATen ops (which don't have dispatch). Upstream issue at #7316
- There's a new cmake macro aten_compile_options for applying all of our
per-target compile time options. We use this on the cpu and cuda libraries.
- test/test_cpp_extensions.py can be run directly by invoking in Python,
assuming you've setup your PYTHONPATH setup correctly
- type_from_string does some new funny business to only query for all valid CUDA
types (which causes CUDA initialization) when we see "torch.cuda." in the
requested string
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
* Last mile libtorch fixes
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
* pedantic fix
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2018-05-10 17:28:33 +00:00
|
|
|
// NB: This is a little unfortunate, in that if I do an isinstance check
|
|
|
|
|
// against torch.cuda.FloatTensor, this will immediately initialize CUDA.
|
|
|
|
|
// I originally thought that it would not be possible for aten_type_ to
|
|
|
|
|
// be nullptr if you had a tensor of some type, in which case you can
|
|
|
|
|
// skip initializign aten_type(), but TestAutograd.test_type_conversions
|
|
|
|
|
// seems to violate this property (for whatever reason.)
|
2019-04-22 04:12:21 +00:00
|
|
|
if (&var.dispatch_type() == self->aten_type() &&
|
|
|
|
|
var.scalar_type() == static_cast<ScalarType>(self->scalar_type)) {
|
2018-02-23 23:03:31 +00:00
|
|
|
Py_RETURN_TRUE;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
Py_RETURN_FALSE;
|
|
|
|
|
END_HANDLE_TH_ERRORS
|
|
|
|
|
}
|
|
|
|
|
|
2018-03-01 19:06:55 +00:00
|
|
|
// Getter behind the metaclass `dtype` property: wraps the dtype stored on the
// tensor type object as a Python object.
PyObject *Tensor_dtype(PyTensorType* self) {
  auto* stored_dtype = self->dtype;
  return torch::autograd::utils::wrap(stored_dtype);
}
|
|
|
|
|
|
Introduce torch.layout and split layout from dtypes. (#6145)
* Introduce torch.layout and split layout from dtypes.
Tensors (and tensor types) now have a 'layout' attribute that returns either 'torch.strided' or 'torch.sparse_coo'.
Previously, dtypes were 1-to-1 with ATen types/PyTensorTypes; the impetus behind this decision was to make things easy in the common case
(i.e. specifying a type in a factory function). But this doesn't really follow for sparsity, which isn't a common case.
It also doesn't properly represent the concept of a dtype, which in numpy are proper scalar types (i.e. roughly the type returned from indexing the
last dimension of an n-d array). But this should be the same whether or not the tensor is represented via strides, sparsity, etc.
This is accomplished by:
1) having the dtype of tensor return the (device-type, scalar-type) combination, i.e. torch.cuda.float32, so both
torch.cuda.FloatTensor and torch.cuda.sparse.FloatTensor have the same dtype
2) Adding a layout parameter to python functions, where the combination of (dtype, layout) maps to an ATen type that is used for dispatch.
* Formatting, make init throw python_error.
* Fix cuda not enabled error message.
* Fix test.
2018-04-02 18:07:50 +00:00
|
|
|
// Getter behind the metaclass `layout` property: wraps the layout stored on
// the tensor type object as a Python object.
PyObject *Tensor_layout(PyTensorType* self) {
  auto* stored_layout = self->layout;
  return torch::autograd::utils::wrap(stored_layout);
}
|
|
|
|
|
|
2018-03-01 19:06:55 +00:00
|
|
|
// Getter behind the metaclass `is_cuda` property: returns Python True when the
// type object's is_cuda flag is set, False otherwise.
PyObject *Tensor_is_cuda(PyTensorType* self) {
  if (!self->is_cuda) {
    Py_RETURN_FALSE;
  }
  Py_RETURN_TRUE;
}
|
|
|
|
|
|
|
|
|
|
// Getter behind the metaclass `is_sparse` property: any layout other than
// at::Layout::Strided is reported as sparse.
PyObject *Tensor_is_sparse(PyTensorType *self) {
  const bool strided = self->layout->layout == at::Layout::Strided;
  if (!strided) {
    Py_RETURN_TRUE;
  }
  Py_RETURN_FALSE;
}
|
|
|
|
|
|
2018-02-23 23:03:31 +00:00
|
|
|
static struct PyMethodDef metaclass_methods[] = {
|
2018-08-30 23:22:24 +00:00
|
|
|
{"__instancecheck__", (PyCFunction)Tensor_instancecheck, METH_O, nullptr},
|
|
|
|
|
{nullptr}
|
2018-02-23 23:03:31 +00:00
|
|
|
};
|
|
|
|
|
|
2018-03-01 19:06:55 +00:00
|
|
|
// Function-pointer type the property table casts its accessor functions to:
// takes the object and an opaque closure pointer, returns a PyObject*.
using getter = PyObject* (*)(PyObject*, void*);
|
|
|
|
|
|
|
|
|
|
static struct PyGetSetDef metaclass_properties[] = {
|
|
|
|
|
{"dtype", (getter)Tensor_dtype, nullptr, nullptr, nullptr},
|
Introduce torch.layout and split layout from dtypes. (#6145)
* Introduce torch.layout and split layout from dtypes.
Tensors (and tensor types) now have a 'layout' attribute that returns either 'torch.strided' or 'torch.sparse_coo'.
Previously, dtypes were 1-to-1 with ATen types/PyTensorTypes; the impetus behind this decision was to make things easy in the common case
(i.e. specifying a type in a factory function). But this doesn't really follow for sparity, which isn't a common case.
It also doesn't properly represent the concept or a dtype, which in numpy are proper scalar types (i.e. roughly the type returned from indexing the
last dimension of an n-d array). But this should be the same whether or not the tensor is represented via strides, sparsity, etc.
This is accomplished by:
1) having the dtype of tensor return the (device-type, scalar-type) combination, i.e. torch.cuda.float32, so both
torch.cuda.FloatTensor and torch.cuda.sparse.FloatTensor have the same dtype
2) Adding a layout parameter to python functions, where the combination of (dtype, layout) maps to an ATen type that is used for dispatch.
* Formatting, make init throw python_error.
* Fix cuda not enabled error message.
* Fix test.
2018-04-02 18:07:50 +00:00
|
|
|
{"layout", (getter)Tensor_layout, nullptr, nullptr, nullptr},
|
2018-03-01 19:06:55 +00:00
|
|
|
{"is_cuda", (getter)Tensor_is_cuda, nullptr, nullptr, nullptr},
|
|
|
|
|
{"is_sparse", (getter)Tensor_is_sparse, nullptr, nullptr, nullptr},
|
|
|
|
|
{nullptr}
|
2018-02-23 23:03:31 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
// Type object that py_initialize_tensor_type installs as the ob_type of every
// tensor type object. Zero-initialized as a static; its fields are presumably
// filled in by py_initialize_metaclass -- confirm at the call site.
static PyTypeObject metaclass;
|
|
|
|
|
|
|
|
|
|
// Fills in the given type object as "torch.tensortype", a subclass of `type`
// carrying the __instancecheck__ method and the read-only properties declared
// in this file, then finalizes it with PyType_Ready.
// Throws python_error if PyType_Ready fails.
static void py_initialize_metaclass(PyTypeObject& metaclass) {
  PyObject* const header = reinterpret_cast<PyObject*>(&metaclass);
  header->ob_refcnt = 1;

  metaclass.tp_name = "torch.tensortype";
  metaclass.tp_base = &PyType_Type;
  metaclass.tp_basicsize = sizeof(PyTypeObject);
  metaclass.tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE;
  metaclass.tp_methods = metaclass_methods;
  metaclass.tp_getset = metaclass_properties;

  const int status = PyType_Ready(&metaclass);
  if (status < 0) {
    throw python_error();
  }
}
|
|
|
|
|
|
|
|
|
|
// Initializes one tensor type object in place: zeroes it, sets up the object
// header by hand, fills in size/flags/name/tp_new, finalizes it with
// PyType_Ready and then merges `tp_dict` into the type's dictionary without
// overriding keys that are already present (override flag 0).
// Throws python_error if PyType_Ready or PyDict_Merge fails.
static void py_initialize_tensor_type(PyTypeObject& type, const char* name, PyObject* tp_dict) {
  // NOTE: the usual static PyTypeObject declaration is not used because as
  // many types must be initialized as there are VariableType instances.
  // The header setup below stands in for PyVarObject_HEAD_INIT(nullptr, 0),
  // which the Python documentation describes as setting the refcount to 1 and
  // the remaining header fields to zero.
  memset(&type, 0, sizeof(PyTypeObject));
  PyObject* const header = reinterpret_cast<PyObject*>(&type);
  header->ob_refcnt = 1;
  header->ob_type = &metaclass;

  type.tp_name = name;
  type.tp_basicsize = sizeof(PyTensorType);
  type.tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE;
  type.tp_new = Tensor_new;

  const int ready_status = PyType_Ready(&type);
  if (ready_status < 0) {
    throw python_error();
  }

  const int merge_status = PyDict_Merge(type.tp_dict, tp_dict, 0);
  if (merge_status < 0) {
    throw python_error();
  }
}
|
|
|
|
|
|
|
|
|
|
// Maps a backend to the dotted name of the Python module associated with it.
// Only the four backends listed below are accepted; any other value is
// reported through AT_ERROR.
static const char* get_module(Backend backend) {
  if (backend == Backend::CPU) {
    return "torch";
  }
  if (backend == Backend::CUDA) {
    return "torch.cuda";
  }
  if (backend == Backend::SparseCPU) {
    return "torch.sparse";
  }
  if (backend == Backend::SparseCUDA) {
    return "torch.cuda.sparse";
  }
  AT_ERROR("invalid backend: ", toString(backend));
}
|
|
|
|
|
|
|
|
|
|
// Builds the qualified name "<module>.<ScalarType>Tensor" for a
// backend/scalar-type pair, using get_module() for the module part and
// toString() for the scalar type part.
static std::string get_name(Backend backend, ScalarType scalarType) {
  std::string qualified_name(get_module(backend));
  qualified_name += ".";
  qualified_name += toString(scalarType);
  qualified_name += "Tensor";
  return qualified_name;
}
|
|
|
|
|
|
2019-04-22 04:12:21 +00:00
|
|
|
// Looks up the Python storage class "<ScalarType>Storage" in the module that
// get_module() names for the backend of `type`.
//
// Throws python_error when the module cannot be imported, and a TypeError
// naming the missing attribute when the module has no such storage class.
// The result is returned as a THPObjectPtr.
static THPObjectPtr get_storage_obj(const Type& type, const ScalarType scalar_type) {
  const char* module_name = get_module(type.backend());
  THPObjectPtr module_obj(PyImport_ImportModule(module_name));
  if (!module_obj) {
    throw python_error();
  }

  std::string storage_name(toString(scalar_type));
  storage_name += "Storage";

  THPObjectPtr storage(PyObject_GetAttrString(module_obj.get(), storage_name.c_str()));
  if (storage.get() == nullptr) {
    throw TypeError("couldn't find storage object %s", storage_name.c_str());
  }
  return storage;
}
|
|
|
|
|
|
2018-02-23 23:03:31 +00:00
|
|
|
static void set_type(PyTensorType& type_obj, Backend backend, ScalarType scalarType) {
|
Split libATen.so into libATen_cpu.so and libATen_cuda.so (#7275)
* Split libATen.so into libATen_cpu.so and libATen_cuda.so
Previously, ATen could be built with either CPU-only support, or
CPU/CUDA support, but only via a compile-time flag, requiring
two separate builds. This means that if you have a program which
indirectly uses a CPU-only build of ATen, and a CPU/CUDA-build of
ATen, you're gonna have a bad time. And you might want a CPU-only
build of ATen, because it is 15M (versus the 300M of a CUDA build).
This commit splits libATen.so into two libraries, CPU/CUDA, so
that it's not necessary to do a full rebuild to get CPU-only
support; instead, if you link against libATen_cpu.so only, you
are CPU-only; if you additionally link/dlopen libATen_cuda.so,
this enables CUDA support. This brings ATen's dynamic library
structure more similar to Caffe2's. libATen.so is no more
(this is BC BREAKING)
The general principle for how this works is that we introduce
a *hooks* interface, which introduces a dynamic dispatch indirection
between a call site and implementation site of CUDA functionality,
mediated by a static initialization registry. This means that we can continue
to, for example, lazily initialize CUDA from Context (a core, CPU class) without
having a direct dependency on the CUDA bits. Instead, we look up
in the registry if, e.g., CUDA hooks have been loaded (this loading
process happens at static initialization time), and if they
have been we dynamic dispatch to this class. We similarly use
the hooks interface to handle Variable registration.
We introduce a new invariant: if the backend of a type has not
been initialized (e.g., its library has not been dlopened; for
CUDA, this also includes CUDA initialization), then the Type
pointers in the context registry are NULL. If you access the
registry directly you must maintain this invariant.
There are a few potholes along the way. I document them here:
- Previously, PyTorch maintained a separate registry for variable
types, because no provision for them was made in the Context's
type_registry. Now that we have the hooks mechanism, we can easily
have PyTorch register variables in the main registry. The code
has been refactored accordingly.
- There is a subtle ordering issue between Variable and CUDA.
We permit libATen_cuda.so and PyTorch to be loaded in either
order (in practice, CUDA is always loaded "after" PyTorch, because
it is lazily initialized.) This means that, when CUDA types are
loaded, we must subsequently also initialize their Variable equivalents.
Appropriate hooks were added to VariableHooks to make this possible;
similarly, getVariableHooks() is not referentially transparent, and
will change behavior after Variables are loaded. (This is different
to CUDAHooks, which is "burned in" after you try to initialize CUDA.)
- The cmake is adjusted to separate dependencies into either CPU
or CUDA dependencies. The generator scripts are adjusted to either
generate a file as a CUDA (cuda_file_manager) or CPU file (file_manager).
- I changed all native functions which were CUDA-only (the cudnn functions)
to have dispatches for CUDA only (making it permissible to not specify
all dispatch options.) This uncovered a bug in how we were handling
native functions which dispatch on a Type argument; I introduced a new
self_ty keyword to handle this case. I'm not 100% happy about it
but it fixed my problem.
This also exposed the fact that set_history incompletely handles
heterogenous return tuples combining Tensor and TensorList. I
swapped this codegen to use flatten() (at the possible cost of
a slight perf regression, since we're allocating another vector now
in this code path).
- thc_state is no longer a public member of Context; use getTHCState() instead
- This PR comes with Registry from Caffe2, for handling static initialization.
I needed to make a bunch of fixes to Registry to make it more portable
- No more ##__VA_ARGS__ token pasting; instead, it is mandatory to pass at
least one argument to the var-args. CUDAHooks and VariableHooks pass a nullary
struct CUDAHooksArgs/VariableHooksArgs to solve the problem. We must get rid of
token pasting because it does not work with MSVC.
- It seems MSVC is not willing to generate code for constructors of template
classes at use sites which cross DLL boundaries. So we explicitly instantiate
the class to get around the problem. This involved tweaks to the boilerplate
generating macros, and also required us to shuffle around namespaces a bit,
because you can't specialize a template unless you are in the same namespace as
the template.
- Insertion of AT_API to appropriate places where the registry must be exported
- We have a general problem which is that on recent Ubuntu distributions,
--as-needed is enabled for shared libraries, which is (cc @apaszke who was
worrying about this in #7160 see also #7160 (comment)). For now, I've hacked
this up in the PR to pass -Wl,--no-as-needed to all of the spots necessary to
make CI work, but a more sustainable solution is to attempt to dlopen
libATen_cuda.so when CUDA functionality is requested.
- The JIT tests somehow manage to try to touch CUDA without loading libATen_cuda.so. So
we pass -Wl,--no-as-needed when linking libATen_cuda.so to _C.so
- There is a very subtle linking issue with lapack, which is solved by making sure libATen_cuda.so links against LAPACK. There's a comment in aten/src/ATen/CMakeLists.txt about this as well as a follow up bug at #7353
- autogradpp used AT_CUDA_ENABLED directly. We've expunged these uses and added
a few more things to CUDAHooks (getNumGPUs)
- Added manualSeedAll to Generator so that we can invoke it polymorphically (it
only does something different for CUDAGenerator)
- There's a new cuda/CUDAConfig.h header for CUDA-only ifdef macros (AT_CUDNN_ENABLED, most prominently)
- CUDAHooks/VariableHooks structs live in at namespace because Registry's
namespace support is not good enough to handle it otherwise (see Registry
changes above)
- There's some modest moving around of native functions in ReduceOps and
UnaryOps to get the CUDA-only function implementations into separate files, so
they are only compiled into libATen_cuda.so. sspaddmm needed a separate CUDA
function due to object linkage boundaries.
- Some direct uses of native functions in CUDA code has to go away, since these
functions are not exported, so you have to go through the dispatcher
(at::native::empty_like to at::empty_like)
- Code in THC/THCS/THCUNN now properly use THC_API macro instead of TH_API
(which matters now that TH and THC are not in the same library)
- Added code debt in torch/_thnn/utils.py and other THNN parsing code to handle
both TH_API and THC_API
- TensorUtils.h is now properly exported with AT_API
- Dead uses of TH_EXPORTS and co expunged; we now use ATen_cpu_exports and
ATen_cuda_exports (new, in ATenCUDAGeneral.h) consistently
- Fix some incorrect type annotations on _cudnn_rnn_backward, where we didn't
declare a type as possibly undefined when we should have. We didn't catch this
previously because optional annotations are not tested on "pass-through" native
ATen ops (which don't have dispatch). Upstream issue at #7316
- There's a new cmake macro aten_compile_options for applying all of our
per-target compile time options. We use this on the cpu and cuda libraries.
- test/test_cpp_extensions.py can be run directly by invoking it in Python,
assuming you've set up your PYTHONPATH correctly
- type_from_string does some new funny business to only query for all valid CUDA
types (which causes CUDA initialization) when we see "torch.cuda." in the
requested string
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
* Last mile libtorch fixes
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
* pedantic fix
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2018-05-10 17:28:33 +00:00
|
|
|
// This field is lazily initialized from backend and scalar_type
|
|
|
|
|
type_obj.aten_type_ = nullptr;
|
|
|
|
|
type_obj.backend = static_cast<int>(backend);
|
|
|
|
|
type_obj.scalar_type = static_cast<int>(scalarType);
|
Introduce torch.layout and split layout from dtypes. (#6145)
* Introduce torch.layout and split layout from dtypes.
Tensors (and tensor types) now have a 'layout' attribute that returns either 'torch.strided' or 'torch.sparse_coo'.
Previously, dtypes were 1-to-1 with ATen types/PyTensorTypes; the impetus behind this decision was to make things easy in the common case
(i.e. specifying a type in a factory function). But this doesn't really follow for sparsity, which isn't a common case.
It also doesn't properly represent the concept of a dtype, which in numpy are proper scalar types (i.e. roughly the type returned from indexing the
last dimension of an n-d array). But this should be the same whether or not the tensor is represented via strides, sparsity, etc.
This is accomplished by:
1) having the dtype of tensor return the (device-type, scalar-type) combination, i.e. torch.cuda.float32, so both
torch.cuda.FloatTensor and torch.cuda.sparse.FloatTensor have the same dtype
2) Adding a layout parameter to python functions, where the combination of (dtype, layout) maps to an ATen type that is used for dispatch.
* Formatting, make init throw python_error.
* Fix cuda not enabled error message.
* Fix test.
2018-04-02 18:07:50 +00:00
|
|
|
type_obj.layout = torch::getLayout(backend);
|
2018-04-12 18:05:44 +00:00
|
|
|
type_obj.dtype = torch::getDtype(scalarType);
|
|
|
|
|
type_obj.is_cuda = (backend == at::Backend::CUDA || backend == at::Backend::SparseCUDA);
|
2018-02-23 23:03:31 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Copies `name` into the `name` buffer of `type_obj`, writing at most
// sizeof(type_obj.name) bytes. A name that does not fit is truncated, and the
// last byte of the buffer is always set to NUL.
static void set_name(PyTensorType& type_obj, const std::string& name) {
  const size_t capacity = sizeof(type_obj.name);
  char* const dest = type_obj.name;
  // strncpy leaves the buffer unterminated when the source fills it
  // completely, so the terminator is forced afterwards.
  strncpy(dest, name.c_str(), capacity);
  dest[capacity - 1] = '\0';
}
|
|
|
|
|
|
2018-04-03 20:29:25 +00:00
|
|
|
// Returns a new dict containing the entries of torch.Tensor's tp_dict followed
// by those of its base type's tp_dict. Both merges use override == 0, so a key
// already copied from torch.Tensor is not replaced by the base type's entry.
// Throws python_error if any Python C API call fails.
static THPObjectPtr get_tensor_dict() {
  THPObjectPtr torch_module(PyImport_ImportModule("torch"));
  if (!torch_module) {
    throw python_error();
  }

  THPObjectPtr tensor_class(PyObject_GetAttrString(torch_module.get(), "Tensor"));
  if (!tensor_class) {
    throw python_error();
  }

  auto tensor_type = reinterpret_cast<PyTypeObject*>(tensor_class.get());
  AT_CHECK(tensor_type->tp_base, "missing base type for Tensor");

  THPObjectPtr res(PyDict_New());
  if (!res) {
    throw python_error();
  }

  // Merge order matters: the derived type's entries go in first and win.
  PyObject* const sources[] = {tensor_type->tp_dict, tensor_type->tp_base->tp_dict};
  for (PyObject* source : sources) {
    if (PyDict_Merge(res.get(), source, /*override=*/0) < 0) {
      throw python_error();
    }
  }

  return res;
}
|
|
|
|
|
|
|
|
|
|
// File-local table of the Python tensor type objects. It is sized and filled by
// initialize_aten_types() from initialize_python_bindings(). The addresses of
// its elements are handed to Python as PyObject* (py_bind_tensor_types) and
// compared against incoming objects (PyTensorType_Check), so the vector must
// not be resized once it has been initialized.
static std::vector<PyTensorType> tensor_types;
|
|
|
|
|
|
|
|
|
|
static void initialize_aten_types(std::vector<PyTensorType>& tensor_types) {
|
|
|
|
|
// includes CUDA types even when PyTorch is not built with CUDA
|
|
|
|
|
auto declared_types = torch::utils::all_declared_types();
|
2018-04-03 20:29:25 +00:00
|
|
|
tensor_types.resize(declared_types.size());
|
2018-02-23 23:03:31 +00:00
|
|
|
|
|
|
|
|
for (size_t i = 0, end = declared_types.size(); i != end; i++) {
|
|
|
|
|
auto& tensor_type = tensor_types[i];
|
|
|
|
|
Backend backend = declared_types[i].first;
|
|
|
|
|
ScalarType scalar_type = declared_types[i].second;
|
|
|
|
|
set_type(tensor_type, backend, scalar_type);
|
|
|
|
|
set_name(tensor_type, get_name(backend, scalar_type));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2018-02-27 22:58:09 +00:00
|
|
|
void initialize_python_bindings() {
|
2018-02-23 23:03:31 +00:00
|
|
|
// Initialize the at::Type* pointers, name, and properties of the PyTensorType
|
|
|
|
|
// vector. After this call, the vector must not be resized.
|
|
|
|
|
initialize_aten_types(tensor_types);
|
|
|
|
|
|
2018-04-03 20:29:25 +00:00
|
|
|
// Initialize the Python metaclass for the torch.FloatTensor, etc. types.
|
|
|
|
|
// The metaclass handles __instancecheck__ checks and binds the dtype property
|
|
|
|
|
// on the type objects.
|
2018-02-23 23:03:31 +00:00
|
|
|
py_initialize_metaclass(metaclass);
|
|
|
|
|
|
|
|
|
|
// Get the tp_dict of the Variable class. We copy function definitions
|
|
|
|
|
// onto each Tensor type object so that they can be accessed via e.g.
|
2018-04-03 20:29:25 +00:00
|
|
|
// `torch.FloatTensor.add`.
|
|
|
|
|
auto tensor_dict = get_tensor_dict();
|
2018-02-23 23:03:31 +00:00
|
|
|
|
2018-04-03 20:29:25 +00:00
|
|
|
// Initialize each Python type object torch.FloatTensor, torch.DoubleTensor, etc.
|
2018-02-23 23:03:31 +00:00
|
|
|
for (auto& tensor_type : tensor_types) {
|
2018-04-03 20:29:25 +00:00
|
|
|
py_initialize_tensor_type(tensor_type.py_type, tensor_type.name, tensor_dict.get());
|
2018-02-23 23:03:31 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Add the type objects to their corresponding modules. e.g. torch.FloatTensor
|
|
|
|
|
// is added to the `torch` module as `FloatTensor`. Also add all the type
|
|
|
|
|
// objects to the set torch._tensor_classes.
|
|
|
|
|
py_bind_tensor_types(tensor_types);
|
2018-03-01 19:06:55 +00:00
|
|
|
|
2018-04-03 20:29:25 +00:00
|
|
|
// Use torch.float32 as the default tensor type
|
2019-04-22 04:12:21 +00:00
|
|
|
set_default_tensor_type(at::globalContext().getVariableType(at::Backend::CPU, at::kFloat), at::kFloat);
|
2018-02-23 23:03:31 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void py_bind_tensor_types(const std::vector<PyTensorType>& tensor_types) {
|
|
|
|
|
auto torch_module = THPObjectPtr(PyImport_ImportModule("torch"));
|
|
|
|
|
if (!torch_module) throw python_error();
|
|
|
|
|
|
|
|
|
|
auto tensor_classes = THPObjectPtr(PyObject_GetAttrString(torch_module.get(), "_tensor_classes"));
|
|
|
|
|
if (!tensor_classes) throw python_error();
|
|
|
|
|
|
|
|
|
|
for (auto& tensor_type : tensor_types) {
|
|
|
|
|
auto name = std::string(tensor_type.name);
|
2018-12-14 21:30:35 +00:00
|
|
|
auto idx = name.rfind('.');
|
2018-02-23 23:03:31 +00:00
|
|
|
auto type_name = name.substr(idx + 1);
|
|
|
|
|
auto module_name = name.substr(0, idx);
|
|
|
|
|
|
|
|
|
|
auto module_obj = THPObjectPtr(PyImport_ImportModule(module_name.c_str()));
|
|
|
|
|
if (!module_obj) throw python_error();
|
|
|
|
|
|
|
|
|
|
PyObject* type_obj = (PyObject*)&tensor_type;
|
|
|
|
|
Py_INCREF(type_obj);
|
|
|
|
|
if (PyModule_AddObject(module_obj.get(), type_name.c_str(), type_obj) < 0) {
|
|
|
|
|
throw python_error();
|
|
|
|
|
}
|
|
|
|
|
if (PySet_Add(tensor_classes.get(), type_obj) < 0) {
|
|
|
|
|
throw python_error();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Returns true iff `obj` points at one of the entries of the file-level
// `tensor_types` table. The check is by address identity only.
static bool PyTensorType_Check(PyObject* obj) {
  for (const PyTensorType& candidate : tensor_types) {
    if (reinterpret_cast<const PyObject*>(&candidate) == obj) {
      return true;
    }
  }
  return false;
}
|
|
|
|
|
|
|
|
|
|
void py_set_default_tensor_type(PyObject* obj) {
|
2018-03-01 19:06:55 +00:00
|
|
|
PyTensorType *type;
|
|
|
|
|
if (PyTensorType_Check(obj)) {
|
|
|
|
|
type = (PyTensorType*)obj;
|
2018-04-16 17:49:00 +00:00
|
|
|
} else {
|
|
|
|
|
throw TypeError("invalid type object");
|
|
|
|
|
}
|
Split libATen.so into libATen_cpu.so and libATen_cuda.so (#7275)
* Split libATen.so into libATen_cpu.so and libATen_cuda.so
Previously, ATen could be built with either CPU-only support, or
CPU/CUDA support, but only via a compile-time flag, requiring
two separate builds. This means that if you have a program which
indirectly uses a CPU-only build of ATen, and a CPU/CUDA-build of
ATen, you're gonna have a bad time. And you might want a CPU-only
build of ATen, because it is 15M (versus the 300M of a CUDA build).
This commit splits libATen.so into two libraries, CPU/CUDA, so
that it's not necessary to do a full rebuild to get CPU-only
support; instead, if you link against libATen_cpu.so only, you
are CPU-only; if you additionally link/dlopen libATen_cuda.so,
this enables CUDA support. This brings ATen's dynamic library
structure more similar to Caffe2's. libATen.so is no more
(this is BC BREAKING)
The general principle for how this works is that we introduce
a *hooks* interface, which introduces a dynamic dispatch indirection
between a call site and implementation site of CUDA functionality,
mediated by a static initialization registry. This means that we can continue
to, for example, lazily initialize CUDA from Context (a core, CPU class) without
having a direct dependency on the CUDA bits. Instead, we look up
in the registry if, e.g., CUDA hooks have been loaded (this loading
process happens at static initialization time), and if they
have been we dynamic dispatch to this class. We similarly use
the hooks interface to handle Variable registration.
We introduce a new invariant: if the backend of a type has not
been initialized (e.g., it's library has not been dlopened; for
CUDA, this also includes CUDA initialization), then the Type
pointers in the context registry are NULL. If you access the
registry directly you must maintain this invariant.
There are a few potholes along the way. I document them here:
- Previously, PyTorch maintained a separate registry for variable
types, because no provision for them was made in the Context's
type_registry. Now that we have the hooks mechanism, we can easily
have PyTorch register variables in the main registry. The code
has been refactored accordingly.
- There is a subtle ordering issue between Variable and CUDA.
We permit libATen_cuda.so and PyTorch to be loaded in either
order (in practice, CUDA is always loaded "after" PyTorch, because
it is lazily initialized.) This means that, when CUDA types are
loaded, we must subsequently also initialize their Variable equivalents.
Appropriate hooks were added to VariableHooks to make this possible;
similarly, getVariableHooks() is not referentially transparent, and
will change behavior after Variables are loaded. (This is different
to CUDAHooks, which is "burned in" after you try to initialize CUDA.)
- The cmake is adjusted to separate dependencies into either CPU
or CUDA dependencies. The generator scripts are adjusted to either
generate a file as a CUDA (cuda_file_manager) or CPU file (file_manager).
- I changed all native functions which were CUDA-only (the cudnn functions)
to have dispatches for CUDA only (making it permissible to not specify
all dispatch options.) This uncovered a bug in how we were handling
native functions which dispatch on a Type argument; I introduced a new
self_ty keyword to handle this case. I'm not 100% happy about it
but it fixed my problem.
This also exposed the fact that set_history incompletely handles
heterogenous return tuples combining Tensor and TensorList. I
swapped this codegen to use flatten() (at the possible cost of
a slight perf regression, since we're allocating another vector now
in this code path).
- thc_state is no longer a public member of Context; use getTHCState() instead
- This PR comes with Registry from Caffe2, for handling static initialization.
I needed to make a bunch of fixes to Registry to make it more portable
- No more ##__VA_ARGS__ token pasting; instead, it is mandatory to pass at
least one argument to the var-args. CUDAHooks and VariableHooks pass a nullary
struct CUDAHooksArgs/VariableHooksArgs to solve the problem. We must get rid of
token pasting because it does not work with MSVC.
- It seems MSVC is not willing to generate code for constructors of template
classes at use sites which cross DLL boundaries. So we explicitly instantiate
the class to get around the problem. This involved tweaks to the boilerplate
generating macros, and also required us to shuffle around namespaces a bit,
because you can't specialize a template unless you are in the same namespace as
the template.
- Insertion of AT_API to appropriate places where the registry must be exported
- We have a general problem which is that on recent Ubuntu distributions,
--as-needed is enabled for shared libraries, which is (cc @apaszke who was
worrying about this in #7160 see also #7160 (comment)). For now, I've hacked
this up in the PR to pass -Wl,--no-as-needed to all of the spots necessary to
make CI work, but a more sustainable solution is to attempt to dlopen
libATen_cuda.so when CUDA functionality is requested.
- The JIT tests somehow manage to try to touch CUDA without loading libATen_cuda.so. So
we pass -Wl,--no-as-needed when linking libATen_cuda.so to _C.so
- There is a very subtle linking issue with lapack, which is solved by making sure libATen_cuda.so links against LAPACK. There's a comment in aten/src/ATen/CMakeLists.txt about htis as well as a follow up bug at #7353
- autogradpp used AT_CUDA_ENABLED directly. We've expunged these uses and added
a few more things to CUDAHooks (getNumGPUs)
- Added manualSeedAll to Generator so that we can invoke it polymorphically (it
only does something different for CUDAGenerator)
- There's a new cuda/CUDAConfig.h header for CUDA-only ifdef macros (AT_CUDNN_ENABLED, most prominently)
- CUDAHooks/VariableHooks structs live in at namespace because Registry's
namespace support is not good enough to handle it otherwise (see Registry
changes above)
- There's some modest moving around of native functions in ReduceOps and
UnaryOps to get the CUDA-only function implementations into separate files, so
they are only compiled into libATen_cuda.so. sspaddmm needed a separate CUDA
function due to object linkage boundaries.
- Some direct uses of native functions in CUDA code has to go away, since these
functions are not exported, so you have to go through the dispatcher
(at::native::empty_like to at::empty_like)
- Code in THC/THCS/THCUNN now properly use THC_API macro instead of TH_API
(which matters now that TH and THC are not in the same library)
- Added code debt in torch/_thnn/utils.py and other THNN parsing code to handle
both TH_API and THC_API
- TensorUtils.h is now properly exported with AT_API
- Dead uses of TH_EXPORTS and co expunged; we now use ATen_cpu_exports and
ATen_cuda_exports (new, in ATenCUDAGeneral.h) consistently
- Fix some incorrect type annotations on _cudnn_rnn_backward, where we didn't
declare a type as possibly undefined when we should have. We didn't catch this
previously because optional annotations are not tested on "pass-through" native
ATen ops (which don't have dispatch). Upstream issue at #7316
- There's a new cmake macro aten_compile_options for applying all of our
per-target compile time options. We use this on the cpu and cuda libraries.
- test/test_cpp_extensions.py can be run directly by invoking in Python,
assuming you've setup your PYTHONPATH setup correctly
- type_from_string does some new funny business to only query for all valid CUDA
types (which causes CUDA initialization) when we see "torch.cuda." in the
requested string
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
* Last mile libtorch fixes
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
* pedantic fix
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2018-05-10 17:28:33 +00:00
|
|
|
auto aten_type = type->aten_type();
|
2019-04-22 04:12:21 +00:00
|
|
|
auto scalar_type = static_cast<ScalarType>(type->scalar_type);
|
Split libATen.so into libATen_cpu.so and libATen_cuda.so (#7275)
* Split libATen.so into libATen_cpu.so and libATen_cuda.so
Previously, ATen could be built with either CPU-only support, or
CPU/CUDA support, but only via a compile-time flag, requiring
two separate builds. This means that if you have a program which
indirectly uses a CPU-only build of ATen, and a CPU/CUDA-build of
ATen, you're gonna have a bad time. And you might want a CPU-only
build of ATen, because it is 15M (versus the 300M of a CUDA build).
This commit splits libATen.so into two libraries, CPU/CUDA, so
that it's not necessary to do a full rebuild to get CPU-only
support; instead, if you link against libATen_cpu.so only, you
are CPU-only; if you additionally link/dlopen libATen_cuda.so,
this enables CUDA support. This brings ATen's dynamic library
structure more similar to Caffe2's. libATen.so is no more
(this is BC BREAKING)
The general principle for how this works is that we introduce
a *hooks* interface, which introduces a dynamic dispatch indirection
between a call site and implementation site of CUDA functionality,
mediated by a static initialization registry. This means that we can continue
to, for example, lazily initialize CUDA from Context (a core, CPU class) without
having a direct dependency on the CUDA bits. Instead, we look up
in the registry if, e.g., CUDA hooks have been loaded (this loading
process happens at static initialization time), and if they
have been we dynamic dispatch to this class. We similarly use
the hooks interface to handle Variable registration.
We introduce a new invariant: if the backend of a type has not
been initialized (e.g., it's library has not been dlopened; for
CUDA, this also includes CUDA initialization), then the Type
pointers in the context registry are NULL. If you access the
registry directly you must maintain this invariant.
There are a few potholes along the way. I document them here:
- Previously, PyTorch maintained a separate registry for variable
types, because no provision for them was made in the Context's
type_registry. Now that we have the hooks mechanism, we can easily
have PyTorch register variables in the main registry. The code
has been refactored accordingly.
- There is a subtle ordering issue between Variable and CUDA.
We permit libATen_cuda.so and PyTorch to be loaded in either
order (in practice, CUDA is always loaded "after" PyTorch, because
it is lazily initialized.) This means that, when CUDA types are
loaded, we must subsequently also initialize their Variable equivalents.
Appropriate hooks were added to VariableHooks to make this possible;
similarly, getVariableHooks() is not referentially transparent, and
will change behavior after Variables are loaded. (This is different
to CUDAHooks, which is "burned in" after you try to initialize CUDA.)
- The cmake is adjusted to separate dependencies into either CPU
or CUDA dependencies. The generator scripts are adjusted to either
generate a file as a CUDA (cuda_file_manager) or CPU file (file_manager).
- I changed all native functions which were CUDA-only (the cudnn functions)
to have dispatches for CUDA only (making it permissible to not specify
all dispatch options.) This uncovered a bug in how we were handling
native functions which dispatch on a Type argument; I introduced a new
self_ty keyword to handle this case. I'm not 100% happy about it
but it fixed my problem.
This also exposed the fact that set_history incompletely handles
heterogenous return tuples combining Tensor and TensorList. I
swapped this codegen to use flatten() (at the possible cost of
a slight perf regression, since we're allocating another vector now
in this code path).
- thc_state is no longer a public member of Context; use getTHCState() instead
- This PR comes with Registry from Caffe2, for handling static initialization.
I needed to make a bunch of fixes to Registry to make it more portable
- No more ##__VA_ARGS__ token pasting; instead, it is mandatory to pass at
least one argument to the var-args. CUDAHooks and VariableHooks pass a nullary
struct CUDAHooksArgs/VariableHooksArgs to solve the problem. We must get rid of
token pasting because it does not work with MSVC.
- It seems MSVC is not willing to generate code for constructors of template
classes at use sites which cross DLL boundaries. So we explicitly instantiate
the class to get around the problem. This involved tweaks to the boilerplate
generating macros, and also required us to shuffle around namespaces a bit,
because you can't specialize a template unless you are in the same namespace as
the template.
- Insertion of AT_API to appropriate places where the registry must be exported
- We have a general problem which is that on recent Ubuntu distributions,
--as-needed is enabled for shared libraries, which is (cc @apaszke who was
worrying about this in #7160 see also #7160 (comment)). For now, I've hacked
this up in the PR to pass -Wl,--no-as-needed to all of the spots necessary to
make CI work, but a more sustainable solution is to attempt to dlopen
libATen_cuda.so when CUDA functionality is requested.
- The JIT tests somehow manage to try to touch CUDA without loading libATen_cuda.so. So
we pass -Wl,--no-as-needed when linking libATen_cuda.so to _C.so
- There is a very subtle linking issue with lapack, which is solved by making sure libATen_cuda.so links against LAPACK. There's a comment in aten/src/ATen/CMakeLists.txt about htis as well as a follow up bug at #7353
- autogradpp used AT_CUDA_ENABLED directly. We've expunged these uses and added
a few more things to CUDAHooks (getNumGPUs)
- Added manualSeedAll to Generator so that we can invoke it polymorphically (it
only does something different for CUDAGenerator)
- There's a new cuda/CUDAConfig.h header for CUDA-only ifdef macros (AT_CUDNN_ENABLED, most prominently)
- CUDAHooks/VariableHooks structs live in at namespace because Registry's
namespace support is not good enough to handle it otherwise (see Registry
changes above)
- There's some modest moving around of native functions in ReduceOps and
UnaryOps to get the CUDA-only function implementations into separate files, so
they are only compiled into libATen_cuda.so. sspaddmm needed a separate CUDA
function due to object linkage boundaries.
- Some direct uses of native functions in CUDA code has to go away, since these
functions are not exported, so you have to go through the dispatcher
(at::native::empty_like to at::empty_like)
- Code in THC/THCS/THCUNN now properly use THC_API macro instead of TH_API
(which matters now that TH and THC are not in the same library)
- Added code debt in torch/_thnn/utils.py and other THNN parsing code to handle
both TH_API and THC_API
- TensorUtils.h is now properly exported with AT_API
- Dead uses of TH_EXPORTS and co expunged; we now use ATen_cpu_exports and
ATen_cuda_exports (new, in ATenCUDAGeneral.h) consistently
- Fix some incorrect type annotations on _cudnn_rnn_backward, where we didn't
declare a type as possibly undefined when we should have. We didn't catch this
previously because optional annotations are not tested on "pass-through" native
ATen ops (which don't have dispatch). Upstream issue at #7316
- There's a new cmake macro aten_compile_options for applying all of our
per-target compile time options. We use this on the cpu and cuda libraries.
- test/test_cpp_extensions.py can be run directly by invoking in Python,
assuming you've setup your PYTHONPATH setup correctly
- type_from_string does some new funny business to only query for all valid CUDA
types (which causes CUDA initialization) when we see "torch.cuda." in the
requested string
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
* Last mile libtorch fixes
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
* pedantic fix
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2018-05-10 17:28:33 +00:00
|
|
|
if (!aten_type) {
|
2018-04-16 17:49:00 +00:00
|
|
|
throw unavailable_type(*type);
|
|
|
|
|
}
|
2019-04-22 04:12:21 +00:00
|
|
|
set_default_tensor_type(*aten_type, scalar_type);
|
2018-04-16 17:49:00 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Changes the default scalar type to the one carried by the torch.dtype object
// `obj`, keeping the current default tensor type. Throws TypeError if `obj` is
// not a dtype object.
void py_set_default_dtype(PyObject* obj) {
  if (!THPDtype_Check(obj)) {
    throw TypeError("invalid type object");
  }
  const auto* dtype = reinterpret_cast<THPDtype*>(obj);
  set_default_tensor_type(*default_tensor_type, dtype->scalar_type);
}
|
2018-03-01 19:06:55 +00:00
|
|
|
|
2019-04-22 04:12:21 +00:00
|
|
|
// Installs `type` / `scalar_type` as the default tensor type and dtype, and
// points torch.Storage at the matching storage object.
//
// Throws TypeError if `scalar_type` is not floating point, if `type` is
// neither a variable type nor undefined, or if `type` is sparse. Throws
// python_error if importing `torch` or assigning torch.Storage fails.
//
// Every step that can fail runs before any global state is modified, so a
// failure leaves the previous default tensor type and dtype in place.
void set_default_tensor_type(const at::Type& type, const ScalarType scalar_type) {
  if (!at::isFloatingType(scalar_type)) {
    throw TypeError("only floating-point types are supported as the default type");
  }
  if (!type.is_variable() && !type.is_undefined()) {
    throw TypeError("only variable types are supported");
  }
  if (type.is_sparse()) {
    throw TypeError("only dense types are supported as the default type");
  }

  // get the storage first, so if it doesn't exist we don't change the default tensor type
  THPObjectPtr storage = get_storage_obj(type, scalar_type);
  const auto dtype_meta = scalarTypeToTypeMeta(scalar_type);

  auto torch_module = THPObjectPtr(PyImport_ImportModule("torch"));
  if (!torch_module) throw python_error();

  // Publish the storage class before touching the C++ defaults: if this fails
  // there is nothing to undo.
  if (PyObject_SetAttrString(torch_module.get(), "Storage", storage.get()) != 0) {
    throw python_error();
  }

  // Commit the new defaults only after the Python-side update has succeeded.
  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
  default_tensor_type = const_cast<Type*>(&type);
  at::set_default_dtype(dtype_meta);
}
|
|
|
|
|
|
|
|
|
|
// Returns the current default tensor type. Asserts that a default has been
// set (i.e. the stored pointer is non-null).
at::Type& get_default_tensor_type() {
  AT_ASSERT(default_tensor_type);
  at::Type& current = *default_tensor_type;
  return current;
}
|
2019-04-04 09:21:09 +00:00
|
|
|
|
|
|
|
|
// Returns the current default dtype converted to a ScalarType.
ScalarType get_default_scalar_type() {
  const auto& default_meta = get_default_dtype();
  return typeMetaToScalarType(default_meta);
}
|
2018-06-26 04:11:49 +00:00
|
|
|
}} // namespace torch::tensors
|