Canonicalize all includes in PyTorch. (#14849)
Summary:
Anywhere we used #include "foo.h", we now say #include <foo.h>
Paths are adjusted to be rooted out of aten/src, torch/lib, or
the root level directory.
I modified CMakeLists.txt by hand to remove TH and THC from
the include paths.
I used the following script to do the canonicalization:
```
import subprocess
import re
import os.path

files = subprocess.check_output(['git', 'ls-files']).decode('utf-8').rstrip().split('\n')
for fn in files:
    if not any(fn.endswith(suff) for suff in ['.cu', '.cpp', '.in', '.h', '.hpp', '.cu', '.cuh', '.cc']):
        continue
    if not any(fn.startswith(pref) for pref in ["aten/", "torch/"]):
        continue
    with open(fn, 'r') as f:
        c = f.read()
    def fmt(p):
        return "#include <{}>".format(p)
    def repl(m):
        p = m.group(1)
        if p in ["dlfcn.h", "unistd.h", "nvrtc.h", "cuda.h", "cuda_runtime.h", "cstdint", "cudnn.h", "Python.h", "cusparse.h", "cuda_runtime_api.h", "cuda_fp16.h", "cublas_v2.h", "stdint.h", "curand_kernel.h"]:
            return fmt(p)
        if any(p.startswith(pref) for pref in ["torch/csrc", "c10/", "ATen/", "caffe2/", "TH/", "THC/", "Eigen/", "gtest/", "zdl/", "gloo/", "onnx/", "miopen/"]):
            return fmt(p)
        for root in ["aten/src", "torch/lib", ""]:
            for bad_root in [os.path.dirname(fn), "aten/src/TH", "aten/src/THC", "torch/csrc"]:
                new_p = os.path.relpath(os.path.join(bad_root, p), root)
                if not new_p.startswith("../") and (os.path.exists(os.path.join(root, new_p)) or os.path.exists(os.path.join(root, new_p + ".in"))):
                    return fmt(new_p)
        print("ERROR: ", fn, p)
        return m.group(0)
    new_c = re.sub(r'#include "([^"]+)"', repl, c)
    if new_c != c:
        print(fn)
        with open(fn, 'w') as f:
            f.write(new_c)
```
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14849
Reviewed By: dzhulgakov
Differential Revision: D13363445
Pulled By: ezyang
fbshipit-source-id: 52361f878a672785f9306c9e9ab2513128092b68
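For illustration, the heart of the rewrite is the single regex substitution the script applies; a minimal standalone sketch:
```
import re

# Same pattern the script uses, with a backreference instead of a repl() callback:
line = '#include "torch/csrc/jit/ir.h"'
print(re.sub(r'#include "([^"]+)"', r'#include <\1>', line))
# -> #include <torch/csrc/jit/ir.h>
```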
#include <torch/csrc/jit/passes/graph_fuser.h>

#include <c10/util/Exception.h>

#include <torch/csrc/jit/autodiff.h>
#include <torch/csrc/jit/custom_operator.h>
#include <torch/csrc/jit/fuser/interface.h>
#include <torch/csrc/jit/operator.h>
#include <torch/csrc/jit/passes/alias_analysis.h>
#include <torch/csrc/jit/passes/common_subexpression_elimination.h>
#include <torch/csrc/jit/passes/constant_pooling.h>
#include <torch/csrc/jit/passes/dead_code_elimination.h>
#include <torch/csrc/jit/passes/utils/subgraph_utils.h>
#include <torch/csrc/jit/script/compiler.h>

Get more fusion after autodiff uses SumToSize (#14957)
Summary:
Here is a fresh attempt at getting some fusion back in autodiff-generated graphs in the presence of SumToSize.
- The sum-to-size operator is now `aten::_grad_sum_to_size`, to allow symbolic script differentiation (which in turn would need to use this in place of sum_to_size to signal that it strictly operates on gradients). This is also used in the autodiff code, replacing `prim::SumToSize`.
- `_grad_sum_to_size` is now fusable; `cat`s - which are fused afterwards thanks to Adam's simplification of the code - are only fused if there is no `_grad_sum_to_size` in the fusion group.
- I push the `_grad_sum_to_size` out of the fusion group when compiling and record the desired summations in the KernelSpec. The reasoning is the following:
  - As the autodiff is a repeated application of the chain rule, we always have the pattern `grad_in = mm(A, grad_out)`, with A often diagonal for cases interesting to the fuser, whence it is `grad_in = a * grad_out` (a pointwise multiplication). We know that only `grad_out` may have AutodiffGradSumToSize applied, so we can commute AutodiffGradSumToSize with the `mul` (and `div` and `neg` are of similar origin).
  - For `type_as` the gradient might be giving the type, so we just skip SumToSize.
  - `add` (which was inserted as `prim::AutogradAdd`) adds gradients when the forward used the same value in several places. This is non-broadcasting, so we know that the two arguments have the same sizes as inputs - which is good, as we don't have to do bookkeeping of the two parts.
Details:
- During fusion, the Tensor arguments are always kept as the first parameters of the fusion group, to accommodate indexing assumptions in the fuser.
- The rewriting of the fusion group to record the necessary output transformation and eliminate `_grad_sum_to_size` from the fusion group is now in the fuser compile step.
- In the execution step, the arguments are split into Tensor / Non-Tensor, and the non-tensor args are mostly forgotten about except for doing `sum_to_size` at the end. This would want to be improved if/when we fuse nonconstant scalar arguments.
- In a number of places in the fuser, the non-Tensor arguments to the fusion group needed to be ignored.
Thank you, apaszke, for the insightful discussion. All bad ideas and errors are my own.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14957
Differential Revision: D13888173
Pulled By: zou3519
fbshipit-source-id: 071992c876e8b845f2b3e6329ae03a835d39a0ea
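As a sanity check on the commuting argument above, here is a minimal numeric sketch (plain PyTorch, not the fuser's implementation): when the pointwise multiplier `a` already has the reduced ("summed-to") size, sum-to-size commutes with the multiplication.
```
import torch

a = torch.randn(3, 1)          # multiplier, already at the reduced size
grad_out = torch.randn(3, 4)   # broadcasted gradient

# sum_to_size(a * grad_out) == a * sum_to_size(grad_out)
lhs = (a * grad_out).sum(dim=1, keepdim=True)
rhs = a * grad_out.sum(dim=1, keepdim=True)
assert torch.allclose(lhs, rhs)
```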
#include <queue>
#include <unordered_map>

namespace torch {
namespace jit {

namespace {

Add Type support to the fuser, fuse more (#14336)
Summary:
This adds scalar type support to the fuser, both internally (instead of auto / assuming float) and for the inputs/outputs.
We can now fuse things with inputs/outputs of arbitrary scalar type; in particular, comparisons and where work well. So it fixes #13384 by returning the right type of tensor (and adds a test where byte and double tensors are returned).
The type inference is done by re-calling PropagateTensorShapeOnNode in the compilation; I would venture that it isn't prohibitively expensive compared to the actual compilation. (Propagation was fixed for where to return the second argument's type, and amended to handle FusedConcat.)
I'm not sure how to add a check for the code generated by the fuser, but I am not sure we absolutely need to (we'd see if it is invalid / produces wrong results).
Thanks in particular to apaszke, fmassa, mruberry for advice and encouragement! All the errors are my own.
I have discussed the order of PRs briefly with mruberry; if this goes in before he submits his PR, he graciously agreed to rebase his, but I'd happily rebase, too.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14336
Differential Revision: D13202620
Pulled By: soumith
fbshipit-source-id: 855159e261fa15f21aca3053bfc05fb3f720a8ef

Make cloneFrom() copy all metadata; use createClone() as much as possible.
To be honest, this was the whole point of this refactor set.
I noticed that in a lot of code, we were repeatedly copying lots of metadata from old nodes to new nodes. This was quite concerning because I wanted to add some more metadata (alias information) and I didn't want to have to get it right in all cases. Plus, in a lot of cases we were forgetting to set more optional properties like debug names when we "copied".
To solve this, I first made cloneFrom() copy all of this metadata. Then, I searched for all occurrences of setType() (a proxy for "I'm cloning this node"), looked for cases where we really were morally doing a copy, and rewrote the code to use cloneFrom() instead, allowing us to drop explicit setType() (and getting more metadata preservation in the process).
Finally, I refactored tryToMoveChunk. The code is modestly longer, but the new version has the nice property that the initialization of selects for input_chunk is next to the creation of the node (as opposed to being delayed for later). I also added a lot more comments for invariants I noticed when I was working on the code.
One minor extra change: TensorType grew a new constructor and a withSizesStride "immutable setter" which returns a new copy of TensorType with different info.
Signed-off-by: Edward Z. Yang <ezyang@fb.com>

// What is a simple mappable operator? It:
// - Has a single tensor output
// - Output and all tensor inputs have the same shape
// - Output and all tensor inputs have the same scalar type,
//   or all tensor inputs have the same scalar type and
//   output is identified in PropagateInputShapes
// - Output and all tensor inputs should be on the same device
// - Produces dense non-overlapping outputs
// Some of these restrictions may be relaxable, but you should
// carefully read the code first, as we rely on these assumptions.
bool isSimpleMap(Node* node) {
  static OperatorSet simple_mappable{{
      "aten::_cast_Float(Tensor self, bool non_blocking) -> Tensor",

      "aten::abs(Tensor self) -> Tensor",
      "aten::acos(Tensor self) -> Tensor",
      "aten::add(Tensor self, Tensor other, *, Scalar alpha) -> Tensor",
      "aten::asin(Tensor self) -> Tensor",
      "aten::atan(Tensor self) -> Tensor",
      "aten::atan2(Tensor self, Tensor other) -> Tensor",
      "aten::ceil(Tensor self) -> Tensor",
      "aten::clamp(Tensor self, Scalar? min, Scalar? max) -> Tensor",
      "aten::cos(Tensor self) -> Tensor",
      "aten::cosh(Tensor self) -> Tensor",
      "aten::div(Tensor self, Tensor other) -> Tensor",
      "aten::exp(Tensor self) -> Tensor",
      "aten::expm1(Tensor self) -> Tensor",
      "aten::erf(Tensor self) -> Tensor",
      "aten::erfc(Tensor self) -> Tensor",
      "aten::floor(Tensor self) -> Tensor",
      "aten::fmod(Tensor self, Tensor other) -> Tensor",
      "aten::frac(Tensor self) -> Tensor",
      "aten::lgamma(Tensor self) -> Tensor",
      "aten::log(Tensor self) -> Tensor",
      "aten::log10(Tensor self) -> Tensor",
      "aten::log1p(Tensor self) -> Tensor",
      "aten::log2(Tensor self) -> Tensor",
      "aten::lerp(Tensor self, Tensor end, Scalar weight) -> Tensor",
      "aten::lerp(Tensor self, Tensor end, Tensor weight) -> Tensor",
      "aten::max(Tensor self, Tensor other) -> Tensor",
      "aten::min(Tensor self, Tensor other) -> Tensor",
      "aten::mul(Tensor self, Tensor other) -> Tensor",
      "aten::neg(Tensor self) -> Tensor",
      "aten::pow(Tensor self, Tensor exponent) -> Tensor",
      "aten::pow(Tensor self, Scalar exponent) -> Tensor",
      "aten::pow(Scalar self, Tensor exponent) -> Tensor",
      "aten::reciprocal(Tensor self) -> Tensor",
      "aten::relu(Tensor self) -> Tensor",
      "aten::threshold(Tensor self, Scalar threshold, Scalar value) -> Tensor",
      "aten::remainder(Tensor self, Tensor other) -> Tensor",
      "aten::round(Tensor self) -> Tensor",
      "aten::rsqrt(Tensor self) -> Tensor",
      "aten::sigmoid(Tensor self) -> Tensor",
      "aten::sin(Tensor self) -> Tensor",
      "aten::sinh(Tensor self) -> Tensor",
      "aten::sqrt(Tensor self) -> Tensor",
      "aten::sub(Tensor self, Tensor other, *, Scalar alpha) -> Tensor",
      "aten::tan(Tensor self) -> Tensor",
      "aten::rand_like(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor",
      "aten::tanh(Tensor self) -> Tensor",
      "aten::trunc(Tensor self) -> Tensor",
      "aten::add(Tensor self, Scalar other, Scalar alpha) -> Tensor",
      "aten::sub(Tensor self, Scalar other, Scalar alpha) -> Tensor",
      "aten::mul(Tensor self, Scalar other) -> Tensor",
      "aten::div(Tensor self, Scalar other) -> Tensor",

      "aten::eq(Tensor self, Tensor other) -> Tensor",
      "aten::eq(Tensor self, Scalar other) -> Tensor",
      "aten::ne(Tensor self, Tensor other) -> Tensor",
      "aten::ne(Tensor self, Scalar other) -> Tensor",
      "aten::ge(Tensor self, Tensor other) -> Tensor",
      "aten::ge(Tensor self, Scalar other) -> Tensor",
      "aten::gt(Tensor self, Tensor other) -> Tensor",
      "aten::gt(Tensor self, Scalar other) -> Tensor",
      "aten::le(Tensor self, Tensor other) -> Tensor",
      "aten::le(Tensor self, Scalar other) -> Tensor",
      "aten::lt(Tensor self, Tensor other) -> Tensor",
      "aten::lt(Tensor self, Scalar other) -> Tensor",

      "aten::addcmul(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor",
      "aten::where(Tensor condition, Tensor self, Tensor other) -> Tensor",

      "aten::type_as(Tensor self, Tensor other) -> Tensor",
  }};
  if (!simple_mappable.find(node)) {
    return false;
  }
  for (Value* input : node->inputs()) {
    if (input->type()->isSubtypeOf(TensorType::get()) ||
        input->type()->isSubtypeOf(FloatType::get())) {
      continue;
    }
    if (input->node()->kind() != prim::Constant) {
      return false;
    }
  }
  return true;
}
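To see the kind of region this predicate admits, a small TorchScript sketch (illustrative only; whether a prim::FusionGroup actually forms depends on build, device, and fuser settings). A chain of same-shape elementwise ops, including comparisons with their non-float outputs, is exactly what isSimpleMap accepts:
```
import torch

def f(x, y):
    # elementwise, same shape and device: each op is "simple mappable"
    return (x * y).sigmoid() + (x < y).float()

scripted = torch.jit.script(f)
x, y = torch.randn(4, 4), torch.randn(4, 4)
scripted(x, y)  # run once so the graph gets profiled/optimized
print(scripted.graph_for(x, y))  # may contain a prim::FusionGroup
```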
Value* broadcastSizes(at::ArrayRef<Value*> sizes) {
  AT_ASSERT(!sizes.empty());
  Graph* graph = sizes[0]->owningGraph();
  Node* broadcast_n =
      graph->insertNode(graph->create(prim::BroadcastSizes, sizes));
  broadcast_n->output()->setType(ListType::ofInts());
  return broadcast_n->output();
}
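In eager PyTorch, the run-time computation behind prim::BroadcastSizes is just broadcasting the input shapes together; for example (assuming a torch version that has torch.broadcast_shapes):
```
import torch

print(torch.broadcast_shapes((3, 1), (1, 4)))  # torch.Size([3, 4])
```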
struct GraphFuser {
  using FusionCallback = std::function<bool(Node*)>;

  Block* block_;
  std::unique_ptr<AliasDb> aliasDb_;
  std::shared_ptr<Graph> graph_;
  FusionCallback callback_ = [&](Node* n) { return isFusableDefault(n); };
  Symbol kind_ = prim::FusionGroup;

  // nvrtc has a limit on the number of arguments allowed in a CUDA kernel.
  // The specific limit is a function of constant memory size, amount available
  // to pass arguments, and some implementation dependence. Select a safe
  // limit here.
  // This limit is also applied to other devices in the fuser by default.
  // Change with setInputArgLimit.
  size_t subgraph_arg_limit_ = 128;

  GraphFuser(Block* block, std::shared_ptr<Graph> graph)
      : block_(block), graph_(std::move(graph)) {}

  // Custom passes require the kind to be specified.
  GraphFuser(
      Block* block,
      std::shared_ptr<Graph> graph,
      FusionCallback callback,
      Symbol kind)
      : block_(block),
        graph_(std::move(graph)),
        callback_(callback),
        kind_(kind) {}

  void setInputArgLimit(size_t limit) {
    subgraph_arg_limit_ = limit;
  }

Rename DynamicType -> TensorType (#16787)
Summary:
```
import json
from subprocess import check_call
from pprint import pprint

renames = {
    'c10::TensorType': 'DimentionedTensorType',
    'c10::DynamicType': 'TensorType',
    'c10::TensorTypePtr': 'DimentionedTensorTypePtr',
    'c10::DynamicTypePtr': 'TensorTypePtr',
    'c10::TypeKind::DynamicType': 'TensorType',
    'c10::TypeKind::TensorType': 'DimentionedTensorType',
}

entries = json.loads(open('compile_commands.json', 'r').read())
build = None
sources = []
for e in entries:
    name = e['file']
    if not ('jit' in name or 'ATen/core' in name):
        continue
    build = e['directory']
    sources.append(name)

args = ['clang-rename', '-i', '-force', '-pl']
for name in sorted(renames.keys()):
    args += ['-qualified-name={}'.format(name), '-new-name={}'.format(renames[name])]

for source in sources:
    cmd = args + [source]
    pprint(args)
    check_call(cmd, cwd=build)
check_call(['git', 'stash', 'push', '-m', 'rename'])
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/16787
Differential Revision: D13974132
Pulled By: zdevito
fbshipit-source-id: 8368fd53e17cff83707bbe77f2d7aad74f8ce60e

  value_list tensorInputs(Node* node) {
    return filter(node->inputs(), [](Value* v) {
      return v->type()->isSubtypeOf(TensorType::get());
    });
  }
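A rough Python-side analogue of what tensorInputs selects, using the TorchScript graph-inspection API (illustrative):
```
import torch

@torch.jit.script
def f(x, n: int):
    return x * n

for node in f.graph.nodes():
    if node.kind() == "aten::mul":
        # one Tensor input (x) and one non-tensor input (n)
        print([str(inp.type()) for inp in node.inputs()])
```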

Unify IR operator representation (stop using attributes in the JIT) (#9807)
Summary:
Based on top of #9763 (the first 3 commits belong to that PR). The first commits from this PR are "Stop using attributes ..."
I tried to separate the changes into fairly meaningful commits. I can't split them up into smaller PRs, because everything starts working and all tests pass only after the whole sequence, but hopefully this will make reviewing somewhat easier.
Known issues/regressions/future tasks:
- `aten::lerp` and `aten::clamp` are no longer fusable
- `CreateAutodiffSubgraphs` needs a rewrite
- It is much more strict now, and will miss a lot of opportunities, especially when viewing ops are involved. Our previous approach was "ignore the assumption on shape availability in gradient formulas to determine differentiability, and hope that shape prop will be robust enough to actually deliver them before we differentiate", which obviously doesn't scale well to more complex cases. We should either work on reducing the size dependency of grad formulas (feasible e.g. for `view`/`reshape`, unfeasible for `squeeze`/`unsqueeze`), or make `CreateAutodiffSubgraphs` integrate some kind of "I could integrate this node into an AD subgraph, but will I be able to infer the shape of its input" reasoning (kind of like a limited shape prop that doesn't infer anything, and only tells if it *could* infer something).
- It sometimes creates constant-only (or constants + one node) graphs, which is useless
- Broken `aten::add` in auto-batching, because it gained a non-tensor input. I changed the test for pointwise operations to use `aten::mul` instead, but I needed to disable the LSTM cell test. I'm not sure how scalar constants should be implemented in this case, because I don't fully understand our format. cc: ChunliF
- Graph import does some hacks to recover the type of constants. This code should be removed once we gain the ability to export the IR along with value types.
- There's still a fair amount of dead code that can be removed. I didn't want to make this diff any bigger, and removing it is an easy task.
- The graph fuser could be improved to use signature matching (possibly using `OperatorSet`) instead of relying on node kinds.
- Manual constant propagation for the `ListConstruct` node in `torch/onnx/utils.py` should be replaced with a proper constant propagation pass (or we should ensure that the one we have handles at least this case before we remove this code).
zdevito
Pull Request resolved: https://github.com/pytorch/pytorch/pull/9807
Reviewed By: ezyang
Differential Revision: D9004285
Pulled By: apaszke
fbshipit-source-id: fe88026a765f6b687354add034c86402362508b7

  bool isFusable(Node* node) {
    return callback_(node);
  }

  bool isFusableDevice(Value* v) {
    if (!v->type()->isSubtypeOf(TensorType::get())) {
      return true;
    }
    auto device = v->type()->expect<TensorType>()->device();
    if (!device) {
      return true;
    }
    if ((*device).is_cpu()) {
      return canFuseOnCPU();
    } else if ((*device).is_cuda()) {
      return canFuseOnGPU();
    }
    throw std::runtime_error("Unknown device");
  }
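The canFuseOnCPU()/canFuseOnGPU() switches consulted here are reachable from Python through private toggles; the names below are internal API and may change between releases:
```
import torch

# Private/internal API: opt the legacy fuser in on CPU and GPU
torch._C._jit_override_can_fuse_on_cpu(True)
torch._C._jit_override_can_fuse_on_gpu(True)
print(torch._C._jit_can_fuse_on_cpu(), torch._C._jit_can_fuse_on_gpu())
```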

  // Default fusability check - used when the user doesn't pass in
  // a callback.
  bool isFusableDefault(Node* node) {
    bool fusableDevice = true;
    for (const auto& output : node->outputs()) {
      if (output->uses().size() > 0) {
        fusableDevice &= isFusableDevice(output);
      }
    }
    return fusableDevice && isFusableMap(node);
  }

  bool isFusableMap(Node* node) {
    // We don't want to bother with cross-block node movements, as they
    // are not necessarily correct.
    if (node->owningBlock() != block_)
      return false;
    return node->kind() == prim::FusionGroup || isSimpleMap(node);
  }

Prevent JIT from overspecializing to every single size configuration (#10844)
Summary:
Please review the expects carefully to make sure there are no regressions. I tried to go over them one by one when they changed, but it's sometimes easy to miss finer details.
Summary of changes:
- Renamed `TensorType` to `CompleteTensorType`. Added a new `TensorType` which records only the scalar type, number of dimensions, and device of a value. The argument behind the rename is to encourage people to use `CompleteTensorType` less, as most passes will only have limited information available. To make the transition easier, `complete_type->cast<TensorType>()` works, and makes our passes work with both kinds of specialization if they don't need the extra detail.
- Renamed `ArgumentSpec` to `CompleteArgumentSpec`. Added a new `ArgumentSpec`, which matches arguments only at the level of the new `TensorType`.
- Shape analysis can process graphs with both `CompleteTensorType` and `TensorType`.
- The fuser was a part that heavily relied on full shape information being available. Now, we simply try to fuse the largest possible graphs, and have to do run-time checks to make sure they match the code we generate. If they don't, we fall back to regular interpretation. The shape checks are implemented using an optimized method exploiting algebraic properties of shapes with broadcasting, and the relations of broadcasting with pointwise ops. A full written proof of correctness of the shape checking algorithm is included in a comment in `graph_fuser.cpp`.
zdevito ezyang mruberry ngimel csarofeen
Pull Request resolved: https://github.com/pytorch/pytorch/pull/10844
Differential Revision: D9498705
Pulled By: apaszke
fbshipit-source-id: 0c53c2fcebd871cc2a29c260f8d012276479cc61

  bool isFusableCatNode(Node* node) {
    if (node->kind() != aten::cat)
      return false;
    if (!node->is_constant(attr::dim))
      return false;

    auto tensors_node = node->namedInput(attr::tensors)->node();
    if ((tensors_node->inputs().size() + node->outputs().size()) >
        subgraph_arg_limit_) {
      return false;
    }
    if (tensors_node->kind() != prim::ListConstruct)
      return false;
    // NB: Note that technically other uses of the list aren't a big problem
    // for us. It would be enough to place the prim::FusedConcat before the
    // prim::ListConstruct, and allUsersAreThisConsumerOrOccurAfterIt would
    // still be satisfied. However, I don't expect this to be necessary any
    // time soon, and so we're simply assuming that we don't have to deal
    // with it.
    if (tensors_node->output()->uses().size() > 1)
      return false;
    return true;
  }
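A numeric illustration of the broadcasting property that the run-time shape checks described in the commit above rely on: the output shape of a pointwise op is the broadcast of its input shapes, so a whole fused region can be validated from its input shapes alone (sketch, assuming torch.broadcast_shapes is available):
```
import torch

a, b, c = torch.empty(3, 1), torch.empty(1, 4), torch.empty(3, 4)
assert torch.broadcast_shapes(a.shape, b.shape, c.shape) == (a + b + c).shape
```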

  bool calculatesSize(Node* node) {
    return node->matches("aten::size(Tensor self) -> int[]");
  }

  bool allUsersAreThisConsumerOrCalcSizes(Node* consumer, Value* producer) {
    auto defining_node = producer->node();
    for (auto o : defining_node->outputs()) {
      for (auto u : o->uses()) {
        if (u.user != consumer && !calculatesSize(u.user))
          return false;
      }
    }
    return true;
  }

Namespaced symbols (#5820)
* Namespaced symbols
- Our interned strings now have structure, "ns::symname" rather than just "symname" before. We support efficient namespace testing for uniques by encoding the namespace in one byte in the Symbol internal representation. See torch/csrc/jit/interned_strings.h for a more in-depth implementation discussion.
- All uses of ksymbol are now attr::symbol (or some appropriate namespace). The valid namespaces are prim, attr, onnx and aten.
- Symbol is bound in Python as a qualified string "attr::symbol", EXCEPT for the attribute setting/getting API, whose symbols must always be attr symbols; they get special-cased to assume strings are passed. There's a little bit of naughtiness in the implementation; maybe you know how to solve it.
- However, the g.op() convenience function assumes that you're generating ONNX operators, unless you explicitly qualify.
- All ATen operators and nodes have built-in interned strings generated for them, so you should never have to write a string literal ever again. The tracing code is adjusted to use it.
- The ONNX exporter now properly tests to see that all operators are in the onnx namespace before accepting the export. This is way more robust than the previous exporter, which would be willing to export capitalized operators which were not actually ONNX operators.
- A slight organizational change for symbolic.py; this module now ONLY contains aten operators. In particular, the exporter for Constant has moved into utils.py (along with Undefined, from the C++ side), since primitive ops get "special treatment."
- The un-inplacing logic in recording is more robust, so that we don't delete a trailing underscore from __and__. This never affected us before because we didn't have any tests for it.
Signed-off-by: Edward Z. Yang <ezyang@fb.com>

  Graph& getSubgraph(Node* n) {
    AT_ASSERT(n->kind() == kind_);
    return *n->g(attr::Subgraph);
  }
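The attr::Subgraph lookup above uses the namespaced Symbols described in the commit message; the same "ns::name" strings are what node kinds stringify to from Python (illustrative):
```
import torch

@torch.jit.script
def f(x):
    return x + 1

for node in f.graph.nodes():
    print(node.kind())  # e.g. "prim::Constant", "aten::add" -- "ns::name" form
```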

  void mergeFusionGroups(Node* consumer_group, Node* producer_group) {
    // Now we have two fusion groups!
    // Revert the fusion - place all inner nodes of producer back in the outer
    // graph.
    std::vector<Node*> temporary_nodes;
    auto producer_subgraph = &getSubgraph(producer_group);

    // Initialize a map of inner graph values to outer graph values
    std::unordered_map<Value*, Value*> inner_to_outer;
    auto inner_inputs = producer_subgraph->inputs();
    auto outer_inputs = producer_group->inputs();
    for (size_t i = 0; i < inner_inputs.size(); ++i) {
      inner_to_outer[inner_inputs[i]] = outer_inputs[i];
    }

    // Clone all nodes
    for (auto inner : producer_subgraph->nodes()) {
      Node* outer = block_->owningGraph()->createClone(
          inner, [&](Value* k) -> Value* { return inner_to_outer.at(k); });
      outer->insertBefore(producer_group);
      temporary_nodes.emplace_back(outer);
      auto inner_outputs = inner->outputs();
      auto outer_outputs = outer->outputs();
      for (size_t i = 0; i < inner_outputs.size(); ++i)
        inner_to_outer[inner_outputs[i]] = outer_outputs[i];
    }

    // Replace uses of producer_group outputs and destroy the producer
    auto subgraph_outputs = producer_subgraph->outputs();
    for (size_t i = 0; i < subgraph_outputs.size(); ++i) {
      auto outer_output = inner_to_outer.at(subgraph_outputs[i]);
      producer_group->outputs()[i]->replaceAllUsesWith(outer_output);
    }
    producer_group->destroy();
    producer_group =
        nullptr; // Just to get a clear error in case someone uses it

    // Inline the temporary nodes into the first group
    auto consumer_subgraph = &getSubgraph(consumer_group);
    for (auto it = temporary_nodes.rbegin(); it != temporary_nodes.rend();
         ++it) {
      Node* node = *it;
      Node* merged = mergeNodeIntoGroup(consumer_group, node);
      // If any of the outputs are still used then we need to add them
      auto outputs = node->outputs();
      for (size_t i = 0; i < outputs.size(); ++i) {
        auto output = outputs[i];
        if (output->uses().size() == 0)
          continue;
        consumer_subgraph->registerOutput(merged->outputs()[i]);
        auto new_output = consumer_group->addOutput();
        output->replaceAllUsesWith(new_output);
        new_output->setType(output->type());
      }
      node->destroy();
    }
  }

  // insert a producer node into a consuming fusion group.
  // DOES NOT WORK if n is a consumer of an output of the fusion group
  // returns the node _inside_ the group that represents the node
  Node* mergeNodeIntoGroup(Node* group, Node* n) {
    AT_ASSERT(n->kind() != kind_);
    auto& subgraph = getSubgraph(group);
    // map from nodes in the surrounding graph to parameters in the fusion
    // group's subgraph that correspond to them
    std::unordered_map<Value*, Value*> inputs_map;
    size_t i = 0;
    size_t tensor_insert_idx = 0;
|
2019-01-24 19:05:07 +00:00
|
|
|
AT_ASSERT(group->inputs().size() == subgraph.inputs().size());
|
2018-12-26 14:52:25 +00:00
|
|
|
for (auto input : group->inputs()) {
|
2017-07-21 07:03:45 +00:00
|
|
|
inputs_map[input] = subgraph.inputs()[i++];
|
Rename DynamicType -> TensorType (#16787)
Summary:
```
import json
from subprocess import check_call
from pprint import pprint
renames = {
'c10::TensorType': 'DimentionedTensorType',
'c10::DynamicType': 'TensorType',
'c10::TensorTypePtr': 'DimentionedTensorTypePtr',
'c10::DynamicTypePtr': 'TensorTypePtr',
'c10::TypeKind::DynamicType': 'TensorType',
'c10::TypeKind::TensorType': 'DimentionedTensorType',
}
entries = json.loads(open('compile_commands.json', 'r').read())
build = None
sources = []
for e in entries:
name = e['file']
if not ('jit' in name or 'ATen/core' in name):
continue
build = e['directory']
sources.append(name)
args = ['clang-rename', '-i', '-force', '-pl']
for name in sorted(renames.keys()):
args += ['-qualified-name={}'.format(name), '-new-name={}'.format(renames[name])]
for source in sources:
cmd = args + [source]
pprint(args)
check_call(cmd, cwd=build)
check_call(['git', 'stash', 'push', '-m', 'rename'])
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/16787
Differential Revision: D13974132
Pulled By: zdevito
fbshipit-source-id: 8368fd53e17cff83707bbe77f2d7aad74f8ce60e
2019-02-07 01:22:47 +00:00
|
|
|
if (input->type()->isSubtypeOf(TensorType::get()))
|
Get more fusion after autodiff uses SumToSize (#14957)
Summary:
Here is a fresh attempt at getting some fusion back in autodiff-generated graphs in the presence of SumToSize.
- The sum to size operator is now `aten::_grad_sum_to_size` to allow symbolic script differentiation (and that in turn would need to use this in place of sum_to_size to signal that it strictly operates on gradients). This is also used in the autodiff code, replacing `prim::SumToSize`.
- `_grad_sum_to_size` is now fusable, `cat`s - which are fused afterwards thanks to Adam's simplification of the code - are only fused if there is no `_grad_sum_to_size` in the fusion group.
- I push the `_grad_sum_to_size` out of the the fusion group when compiling and record the desired summations in the KernelSpec. The reasoning is the following:
- As the autodiff is a repeated applicaiton of the chain rule, we always have the pattern `grad_in = mm(A, grad_out)`, with A often diagonal for cases interesting to the fuser, whence it is `grad_in = a * grad_out` (a pointwise multiplication). We know that only `grad_out` may have AutodiffGradSumToSize applied, so we can commute AutodiffGradSumToSize with the `mul` (and `div` and `neg` are of similar origin).
- For `type_as` the gradient might be giving the type, so just skip SumToSize,
- `add` (which was inserted as `prim::AutogradAdd`) adding gradients when the forward used the same value in several places. This is non-broadcasting, so we know that the two arguments would have the same sizes as inputs - which is good so we don't have to do bookkeeping of the two parts.
Details:
- During fusion, the Tensor arguments are always kept as the first parameters of the fusion group to accomodate indexing assumptions in the fuser.
- The rewriting of the fusion group to record the necessary output transformation and eliminate `_grad_sum_to_size` from the fusion group is now in the fuser compile step.
- In the execution step, the arguments are split into Tensor / Non-Tensor and the non-tensor args are mostly forgotten about except for doing `sum_to_size` at the end. This would want to be improved if/when we fuse nonconstant scalar arguments.
- In a number of places in the fuser, the non-Tensor arguments to the fusion group needed to be ignored.
Thank you, apaszke for the insightful discussion. All bad ideas and errors are my own.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14957
Differential Revision: D13888173
Pulled By: zou3519
fbshipit-source-id: 071992c876e8b845f2b3e6329ae03a835d39a0ea
2019-01-31 19:57:56 +00:00
|
|
|
tensor_insert_idx = i;
|
2017-07-21 07:03:45 +00:00
|
|
|
}
|
2018-12-26 14:52:25 +00:00
|
|
|
// add n's inputs to the fusion group's input list if we don't already have
|
|
|
|
|
// them
|
Get more fusion after autodiff uses SumToSize (#14957)
Summary:
Here is a fresh attempt at getting some fusion back in autodiff-generated graphs in the presence of SumToSize.
- The sum to size operator is now `aten::_grad_sum_to_size` to allow symbolic script differentiation (and that in turn would need to use this in place of sum_to_size to signal that it strictly operates on gradients). This is also used in the autodiff code, replacing `prim::SumToSize`.
- `_grad_sum_to_size` is now fusable, `cat`s - which are fused afterwards thanks to Adam's simplification of the code - are only fused if there is no `_grad_sum_to_size` in the fusion group.
- I push the `_grad_sum_to_size` out of the the fusion group when compiling and record the desired summations in the KernelSpec. The reasoning is the following:
- As the autodiff is a repeated applicaiton of the chain rule, we always have the pattern `grad_in = mm(A, grad_out)`, with A often diagonal for cases interesting to the fuser, whence it is `grad_in = a * grad_out` (a pointwise multiplication). We know that only `grad_out` may have AutodiffGradSumToSize applied, so we can commute AutodiffGradSumToSize with the `mul` (and `div` and `neg` are of similar origin).
- For `type_as` the gradient might be giving the type, so just skip SumToSize,
- `add` (which was inserted as `prim::AutogradAdd`) adding gradients when the forward used the same value in several places. This is non-broadcasting, so we know that the two arguments would have the same sizes as inputs - which is good so we don't have to do bookkeeping of the two parts.
Details:
- During fusion, the Tensor arguments are always kept as the first parameters of the fusion group to accomodate indexing assumptions in the fuser.
- The rewriting of the fusion group to record the necessary output transformation and eliminate `_grad_sum_to_size` from the fusion group is now in the fuser compile step.
- In the execution step, the arguments are split into Tensor / Non-Tensor and the non-tensor args are mostly forgotten about except for doing `sum_to_size` at the end. This would want to be improved if/when we fuse nonconstant scalar arguments.
- In a number of places in the fuser, the non-Tensor arguments to the fusion group needed to be ignored.
Thank you, apaszke for the insightful discussion. All bad ideas and errors are my own.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14957
Differential Revision: D13888173
Pulled By: zou3519
fbshipit-source-id: 071992c876e8b845f2b3e6329ae03a835d39a0ea
2019-01-31 19:57:56 +00:00
|
|
|
// we insert tensors first because the fuser assumes that to be the case
|
|
|
|
|
// (as a legacy from tensors only)
|
2018-08-17 21:08:05 +00:00
|
|
|
WithInsertPoint guard(*subgraph.nodes().begin());
|
Make cloneFrom() copy all metadata; use createClone() as much as possible.
To be honest, this was the whole point of this refactor set.
I noticed that in a lot of code, we were repeatedly copying lots of metadata
from old nodes to new nodes. This was quite concerning because I wanted to
add some more metadata (alias information) and I didn't want to have to
get it right in all cases. Plus, in a lot of cases we were forgetting
to set more optional properties like debug names when we "copied".
To solve this, I first made cloneFrom() copy all of this metadata. Then,
I searched for all occurrences of setType() (a proxy for "I'm cloning this
node), looked for cases where we really were morally doing a copy, and rewrote
the code to use cloneFrom() instead, allowing us to drop explicit setType()
(and getting more metadata preservation in the process.)
Finally, I refactored tryToMoveChunk. The code is modestly longer,
but the new version has the nice property that the initialization of
selects for input_chunk are next to the creation of the node (as opposed
to delayed for later.) I also added a lot more comments for invariants
I noticed when I was working on the code.
One minor extra change: TensorType grew a new constructor and a withSizesStride
"immutable setter" which returns a new copy of TensorType with different info.
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-09-14 04:57:05 +00:00
|
|
|
for (auto input : n->inputs()) {
|
|
|
|
|
if (inputs_map.count(input) == 0) {
|
Rename DynamicType -> TensorType (#16787)
Summary:
```
import json
from subprocess import check_call
from pprint import pprint
renames = {
'c10::TensorType': 'DimentionedTensorType',
'c10::DynamicType': 'TensorType',
'c10::TensorTypePtr': 'DimentionedTensorTypePtr',
'c10::DynamicTypePtr': 'TensorTypePtr',
'c10::TypeKind::DynamicType': 'TensorType',
'c10::TypeKind::TensorType': 'DimentionedTensorType',
}
entries = json.loads(open('compile_commands.json', 'r').read())
build = None
sources = []
for e in entries:
name = e['file']
if not ('jit' in name or 'ATen/core' in name):
continue
build = e['directory']
sources.append(name)
args = ['clang-rename', '-i', '-force', '-pl']
for name in sorted(renames.keys()):
args += ['-qualified-name={}'.format(name), '-new-name={}'.format(renames[name])]
for source in sources:
cmd = args + [source]
pprint(args)
check_call(cmd, cwd=build)
check_call(['git', 'stash', 'push', '-m', 'rename'])
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/16787
Differential Revision: D13974132
Pulled By: zdevito
fbshipit-source-id: 8368fd53e17cff83707bbe77f2d7aad74f8ce60e
2019-02-07 01:22:47 +00:00
|
|
|
if (input->type()->isSubtypeOf(TensorType::get())) {
|
Get more fusion after autodiff uses SumToSize (#14957)
Summary:
Here is a fresh attempt at getting some fusion back in autodiff-generated graphs in the presence of SumToSize.
- The sum to size operator is now `aten::_grad_sum_to_size` to allow symbolic script differentiation (and that in turn would need to use this in place of sum_to_size to signal that it strictly operates on gradients). This is also used in the autodiff code, replacing `prim::SumToSize`.
- `_grad_sum_to_size` is now fusable, `cat`s - which are fused afterwards thanks to Adam's simplification of the code - are only fused if there is no `_grad_sum_to_size` in the fusion group.
- I push the `_grad_sum_to_size` out of the the fusion group when compiling and record the desired summations in the KernelSpec. The reasoning is the following:
- As the autodiff is a repeated applicaiton of the chain rule, we always have the pattern `grad_in = mm(A, grad_out)`, with A often diagonal for cases interesting to the fuser, whence it is `grad_in = a * grad_out` (a pointwise multiplication). We know that only `grad_out` may have AutodiffGradSumToSize applied, so we can commute AutodiffGradSumToSize with the `mul` (and `div` and `neg` are of similar origin).
- For `type_as` the gradient might be giving the type, so just skip SumToSize,
- `add` (which was inserted as `prim::AutogradAdd`) adding gradients when the forward used the same value in several places. This is non-broadcasting, so we know that the two arguments would have the same sizes as inputs - which is good so we don't have to do bookkeeping of the two parts.
Details:
- During fusion, the Tensor arguments are always kept as the first parameters of the fusion group to accomodate indexing assumptions in the fuser.
- The rewriting of the fusion group to record the necessary output transformation and eliminate `_grad_sum_to_size` from the fusion group is now in the fuser compile step.
- In the execution step, the arguments are split into Tensor / Non-Tensor and the non-tensor args are mostly forgotten about except for doing `sum_to_size` at the end. This would want to be improved if/when we fuse nonconstant scalar arguments.
- In a number of places in the fuser, the non-Tensor arguments to the fusion group needed to be ignored.
Thank you, apaszke for the insightful discussion. All bad ideas and errors are my own.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14957
Differential Revision: D13888173
Pulled By: zou3519
fbshipit-source-id: 071992c876e8b845f2b3e6329ae03a835d39a0ea
2019-01-31 19:57:56 +00:00
|
|
|
auto in_group = subgraph.insertInput(tensor_insert_idx);
|
|
|
|
|
in_group->setType(input->type());
|
|
|
|
|
inputs_map[input] = in_group;
|
|
|
|
|
group->insertInput(tensor_insert_idx, input);
|
|
|
|
|
tensor_insert_idx++;
|
|
|
|
|
} else if (
|
2019-04-12 21:24:37 +00:00
|
|
|
(input->type()->isSubtypeOf(FloatType::get()) &&
|
|
|
|
|
input->node()->kind() != prim::Constant) ||
|
|
|
|
|
(n->kind() == aten::_grad_sum_to_size &&
|
|
|
|
|
input->type()->isSubtypeOf(ListType::ofInts()))) {
|
Unify IR operator representation (stop using attributes in the JIT) (#9807)
Summary:
Based on top of #9763 (first 3 commits belong to that PR). The first commits from this PR are "Stop using attributes ..."
I tried to separate the changes into fairly meaningful commits. I can't split them up into smaller PRs, because everything starts working and all tests pass only after the whole sequence, but hopefully this will make reviewing somewhat easier.
Known issues/regressions/future tasks:
- `aten::lerp` and `aten::clamp` are no longer fusable
- `CreateAutodiffSubgraphs` needs a rewrite
- It is much more strict now, and will miss a lot of opportunities, especially when viewing ops are involved. Our previous approach was "ignore the assumption on shape availability in gradient formulas to determine differentiability, and hope that shape prop will be robust enough to actually deliver them before we differentiate", which obviously doesn't scale well to more complex cases. We should either work on reducing the size dependency of grad formulas (feasible e.g. for `view`/`reshape`, unfeasible for `squeeze`/`unsqueeze`), or make `CreateAutodiffSubgraphs` integrate some kind of "I could integrate this node into an AD subgraph, but will I be able to infer the shape of its input" reasoning (kind of like a limited shape prop, that doesn't infer anything, and only tells if it *could* infer something).
- It sometimes creates constant-only (or constants + one node) graphs, which is useless
- Broken `aten::add` in auto-batching, because it gained a non-tensor input. I changed the test for pointwise operations to use `aten::mul` instead, but I needed to disable the LSTM cell test. I'm not sure how scalar constants should be implemented in this case, because I don't fully understand our format. cc: ChunliF
- Graph import does some hacks to recover type of constants. This code should be removed once we'll gain the ability to export the IR along with value types.
- There's still a fair amount of dead code that can be removed. I didn't want to make this diff any bigger, and removing it is an easy task.
- Graph fuser could be improved to use signature matching (possibly using `OperatorSet`) instead of basing on node kinds.
- Manual constant propagation for the `ListConstruct` node in `torch/onnx/utils.py` should be replaced with a proper constant propagation pass (or we should ensure that the one we have handles at least this case before we remove this code).
zdevito
Pull Request resolved: https://github.com/pytorch/pytorch/pull/9807
Reviewed By: ezyang
Differential Revision: D9004285
Pulled By: apaszke
fbshipit-source-id: fe88026a765f6b687354add034c86402362508b7
2018-07-27 05:03:44 +00:00
          auto in_group = subgraph.addInput();
          in_group->setType(input->type());
          inputs_map[input] = in_group;
          group->addInput(input);
        } else {
          // We don't support passing in scalars as arguments to fused kernels,
          // so we generally don't allow fusing tensor-scalar operations unless
          // the scalar is constant. In those cases we inline the constants
          // directly in the body of the fused group.
          AT_ASSERT(input->node()->kind() == prim::Constant);
          Node* in_const =
              subgraph.createClone(input->node(), [](Value*) -> Value* {
                throw std::runtime_error("unexpected input");
              });
          subgraph.insertNode(in_const);
          inputs_map[input] = in_const->output();
        }
      }
    }
    // copy n into the graph, remapping its inputs to internal nodes
    Node* in_graph = subgraph.createClone(
        n, [&](Value* k) -> Value* { return inputs_map[k]; });
    // if n's outputs are already inputs to the fusion group,
    // we need to remove them because n is now inside the fusion group.
    //
    // i.e.,
    // x = f(w); group(x, y, z) becomes group(w, y, z).
    // x, y, z = f(w); group(x, y, z) becomes group(w).
    //
    // remapping nodes that used the input to the newly-merged node
    // n is not an input when the fusion group is empty
    auto inputs = group->inputs();
    for (size_t i = 0; i < n->outputs().size(); ++i) {
      auto it = std::find(inputs.begin(), inputs.end(), n->outputs()[i]);
      if (it != inputs.end()) {
        size_t p = it - inputs.begin();
        group->removeInput(p);
        subgraph.inputs()[p]->replaceAllUsesWith(in_graph->outputs()[i]);
        subgraph.eraseInput(p);
      }
    }
    return subgraph.insertNode(in_graph);
  }
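
  // Illustrative sketch of the rewrite performed above (value names such as
  // %a, %b, %c are hypothetical, not taken from any real graph). Merging
  // n = aten::mul into an existing group whose input it produced:
  //
  //   %x = aten::mul(%a, %b)
  //   %y = prim::FusionGroup_0(%x, %c)
  //
  // becomes
  //
  //   %y = prim::FusionGroup_0(%a, %b, %c)
  //
  // the mul is now computed inside the subgraph, and the subgraph input that
  // carried %x is erased.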

  // turn consumer node n into a fusion group with just n inside
  // to prepare for fusion and replace uses of n with the new group
  Node* createSingletonFusionGroup(Node* n) {
    auto group = block_->owningGraph()->createWithSubgraph(kind_);
    // propagate position information for the new node so we can always
    // have a valid mapping
    group->insertBefore(n);
    Node* mergedNode = mergeNodeIntoGroup(group, n);
    getSubgraph(group).registerOutput(mergedNode->output());
    auto sel = group->addOutput();
    sel->copyMetadata(n->output());
    n->replaceAllUsesWith(group);
    n->destroy();
    return group;
  }
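
  // Illustrative sketch (hypothetical IR; assumes kind_ is
  // prim::FusionGroup): wrapping a single node,
  //
  //   %y = aten::tanh(%x)
  //
  // becomes
  //
  //   %y = prim::FusionGroup_0(%x)
  //
  // where the group's subgraph contains the cloned tanh, so neighboring
  // nodes can later be merged into the same group.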

  at::optional<Node*> tryFuse(Node* consumer, Value* producer) {
    // this handles cases where producer can be moved _into_ the fusion group
    // of consumer.
    // TODO: extend to fusion of consumer into _producer's_ fusion blob:
    // if allInputsAreThisProducer(consumer, producer) holds, we can move the
    // consumer up into the producer; but this requires better handling of
    // merging fusion groups, so it is not done now.
    bool shouldFuse = isFusable(producer->node()) &&
        // Rearrange nodes such that all uses of producer are after the
        // consumer. Fusion will rewrite those later uses to use the version of
        // producer generated by the fused blob. In this case, producer becomes
        // an output of the fusion group.
        aliasDb_->moveBeforeTopologicallyValid(producer->node(), consumer);

    if (!shouldFuse) {
      return at::nullopt;
    }

    if ((consumer->inputs().size() + consumer->outputs().size() +
         producer->node()->inputs().size() +
         producer->node()->outputs().size()) > subgraph_arg_limit_) {
      return at::nullopt;
    }

    auto group = consumer;
    if (consumer->kind() != kind_) {
      group = createSingletonFusionGroup(consumer);
    }

    if (producer->node()->kind() == kind_) {
      mergeFusionGroups(group, producer->node());
      return group;
    }
    AT_ASSERT(producer->node()->outputs().size() == 1);
    Node* merged = mergeNodeIntoGroup(group, producer->node());
    // remaining uses of this producer can occur because we allow
    // fusion in cases where uses remain after the consumer
    // if these exist, re-route them to the version of producer
    // created in FusionGroup
    if (producer->uses().size() != 0) {
      getSubgraph(group).registerOutput(merged->output());
      Value* new_producer = group->addOutput();
      new_producer->copyMetadata(producer);
      producer->replaceAllUsesWith(new_producer);
    }
    producer->node()->destroy();
    return group;
  }
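
  // A plausible caller pattern (sketch only; the real scan loop lives
  // elsewhere in this pass): try each producer in reverse topological order
  // and restart scanning from the group on success.
  //
  //   if (auto group = tryFuse(consumer, producer)) {
  //     return (*group)->reverseIterator();
  //   }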

  bool canFuseChunk(Node* consumer, Value* producer) {
    if (consumer->kind() != prim::FusionGroup) {
      return false;
    }
    // Does the chunk have constant chunks/dim?
    auto* chunk = producer->node();
    if (chunk->kind() != prim::ConstantChunk)
      return false;
    // And all uses of the chunk are in this consumer
    for (auto s : chunk->outputs()) {
      for (auto u : s->uses()) {
        if (u.user != consumer) {
          return false;
        }
      }
    }
    // And isn't a no-op chunk (chunks == 1). Have CSE clean this up.
    // We could fuse this but it's better to just delete the node.
    if (chunk->i(attr::chunks) == 1) {
      return false;
    }
    return true;
  }
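
  // Illustrative IR for a fusable chunk (hypothetical names):
  //
  //   %a, %b = prim::ConstantChunk[chunks=2, dim=0](%x)
  //   %y = prim::FusionGroup_0(%a, %b)
  //
  // both chunk outputs are consumed only by the group, chunks/dim are
  // constant attributes of the node, and chunks > 1.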

  c10::optional<Node*> findFusedChunk(Node* group, Value* input) {
    AT_ASSERT(group->kind() == prim::FusionGroup);
    auto it = std::find(group->inputs().begin(), group->inputs().end(), input);
    if (it == group->inputs().end()) {
      return c10::nullopt;
    }
    size_t input_index = it - group->inputs().begin();
    auto& subgraph = getSubgraph(group);
    auto* subgraph_input = subgraph.inputs().at(input_index);
    // If subgraph_input is an input to prim::ConstantChunk, it will have 1 use
    auto* node = subgraph_input->uses().at(0).user;
    if (node->kind() == prim::ConstantChunk) {
      AT_ASSERT(subgraph_input->uses().size() == 1);
      return node;
    }
    return c10::nullopt;
  }
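
  // Sketch (hypothetical IR): if group input %x maps to subgraph input %x'
  // and the subgraph contains
  //
  //   %a', %b' = prim::ConstantChunk[chunks=2, dim=0](%x')
  //
  // then findFusedChunk(group, %x) returns that ConstantChunk node; for any
  // other input it returns c10::nullopt.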

  void fuseChunkByReusingExistingFusedChunk(
      Node* group,
      Node* chunk,
      Node* existingFusedChunk) {
    if (chunk->outputs().size() != existingFusedChunk->outputs().size()) {
      return;
    }
    auto& subgraph = getSubgraph(group);
    for (size_t i = 0; i < chunk->outputs().size(); ++i) {
      // Find the input to the FusionGroup (group)
      auto* replacement_val = existingFusedChunk->outputs().at(i);
      auto* val = chunk->outputs().at(i);
      auto it = std::find(group->inputs().begin(), group->inputs().end(), val);
      auto input_index = it - group->inputs().begin();

      // Rewrite the graph to use replacement_val
      auto group_input = subgraph.inputs().at(input_index);
      group_input->replaceAllUsesWith(replacement_val);

      // Remove the input, it's no longer needed
      group->removeInput(input_index);
      subgraph.eraseInput(input_index);
    }
    chunk->destroy();
  }
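
  // Sketch of the rewrite (hypothetical IR): when the group already chunks
  // %x internally, a second external chunk of %x feeding the group
  //
  //   %a, %b = prim::ConstantChunk[chunks=2, dim=0](%x)
  //   %y = prim::FusionGroup_0(..., %a, %b, ...)
  //
  // is eliminated: the subgraph inputs that carried %a and %b are redirected
  // to the outputs of the existing internal ConstantChunk, the now-unused
  // group inputs are dropped, and the external chunk node is destroyed.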

  // There are two invariants for prim::ConstantChunk:
  // (1) the tensor input to prim::ConstantChunk must be an input to the
  //     fusion group
  // (2) no two ConstantChunks in the same FusionGroup can share a tensor
  //     input.
  graph_node_list::iterator fuseChunk(Node* consumer, Value* producer) {
    auto* chunk = producer->node();
    AT_ASSERT(consumer->kind() == prim::FusionGroup);
    AT_ASSERT(chunk->kind() == prim::ConstantChunk);

    // if producer's input is already an input to a prim::ConstantChunk node,
    // we cannot add a new prim::ConstantChunk node because of invariant (2).
    auto* chunked_tensor = producer->node()->input();
    if (auto existingFusedChunk = findFusedChunk(consumer, chunked_tensor)) {
      fuseChunkByReusingExistingFusedChunk(
          consumer, chunk, *existingFusedChunk);
      return consumer->reverseIterator();
    }

    // Move prim::ConstantChunk into the FusionGroup
    mergeNodeIntoGroup(consumer, chunk);
    chunk->destroy();
    return consumer->reverseIterator();
  }
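
  // End-to-end sketch (hypothetical IR): fusing a fresh chunk,
  //
  //   %a, %b = prim::ConstantChunk[chunks=2, dim=0](%x)
  //   %y = prim::FusionGroup_0(%a, %b)
  //
  // becomes
  //
  //   %y = prim::FusionGroup_0(%x)
  //
  // with the ConstantChunk re-created inside the subgraph, so the chunked
  // tensor itself is the group input, preserving invariant (1).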

  value_list sortReverseTopological(ArrayRef<Value*> inputs) {
    value_list result;
    for (auto i : inputs) {
      if (i->node()->owningBlock() == block_) {
        result.push_back(i);
      }
    }
    // Sort in reverse topological order
    std::sort(result.begin(), result.end(), [&](Value* a, Value* b) {
      return a->node()->isAfter(b->node());
    });
    return result;
  }
|
|
|
|
|
|
2018-12-26 14:52:25 +00:00
|
|
|
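// For example, if %a, %b and %c are produced by nodes of block_ that appear
// in that order, sortReverseTopological({%a, %c, %b}) yields {%c, %b, %a};
// values produced outside of block_ are filtered out entirely.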
graph_node_list::iterator scanNodeForChunks(Node* consumer) {
  if (consumer->kind() == prim::FusionGroup) {
    auto inputs = sortReverseTopological(consumer->inputs());
    for (auto producer : inputs) {
      if (!canFuseChunk(consumer, producer)) {
        continue;
      }
      return fuseChunk(consumer, producer);
    }
  }
  return ++consumer->reverseIterator();
}
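// NB: the return value is the point at which the caller's reverse scan over
// the block should resume: fuseChunk() picks the resume point itself when a
// chunk was absorbed, while ++consumer->reverseIterator() simply steps past
// a consumer that yielded no fusion.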
at::ArrayRef<Value*> broadcast_tensors(value_list inputs) {
  AT_ASSERT(inputs.size() > 0);
  auto* g = inputs[0]->owningGraph();
  auto* input_list =
      g->insertNode(g->createList(TensorType::get(), inputs))->output();
  auto* output_list = g->insert(aten::broadcast_tensors, {input_list});
  auto* unpack_node = g->insertNode(
      g->create(prim::ListUnpack, {output_list}, inputs.size()));
  return unpack_node->outputs();
}
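// For two inputs (%x, %y), the nodes inserted above take the shape of
// (a sketch; value names are illustrative):
//   %list : Tensor[] = prim::ListConstruct(%x, %y)
//   %bcast : Tensor[] = aten::broadcast_tensors(%list)
//   %x2 : Tensor, %y2 : Tensor = prim::ListUnpack(%bcast)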
void insertExplicitBroadcast(Node* node) {
  WithInsertPoint insert_guard{node};
  auto tensors = tensorInputs(node);
  auto new_tensors = broadcast_tensors(tensors);

  // Replace tensor inputs with their broadcasted values
  auto new_tensors_it = new_tensors.begin();
  for (size_t i = 0; i < node->inputs().size(); ++i) {
    if (node->inputs()[i]->type()->isSubtypeOf(TensorType::get())) {
      AT_ASSERT(new_tensors_it != new_tensors.end());
      node->replaceInput(i, *(new_tensors_it++));
    }
  }
}
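// Example (a sketch): for %c = aten::add(%a, %b, %alpha), only %a and %b
// pass the isSubtypeOf(TensorType::get()) filter, so the node becomes
//   %a2, %b2 = <broadcast of (%a, %b) via broadcast_tensors()>
//   %c = aten::add(%a2, %b2, %alpha)
// while the scalar %alpha input is left untouched.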
Node* promoteChunkToBroadcastingChunk(Node* chunk) {
  AT_ASSERT(chunk->kind() == prim::ConstantChunk);

  size_t nchunks = chunk->i(attr::chunks);
  Node* bchunk =
      chunk->owningGraph()->create(prim::BroadcastingChunk, nchunks);
  bchunk->addInput(chunk->input());
  for (size_t i = 0; i < nchunks; ++i) {
    auto* old_output = chunk->outputs().at(i);
    auto* new_output = bchunk->outputs().at(i);
    new_output->copyMetadata(old_output);
    old_output->replaceAllUsesWith(new_output);
  }
  bchunk->copyAttributes(*chunk);
  bchunk->insertAfter(chunk);
  chunk->destroy();
  return bchunk;
}
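// Example (a sketch): a single-input
//   %a, %b = prim::ConstantChunk[chunks=2, dim=0](%x)
// is replaced in place by
//   %a, %b = prim::BroadcastingChunk[chunks=2, dim=0](%x)
// with output metadata and all uses transferred, so downstream consumers
// are unaffected by the promotion.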
// In places where an op can be fused into a consumer but a chunk is in the
// way, distribute the chunk to the op's operands:
// replace a,b = chunk(op(x,y,z)) with:
// x', y', z' = broadcast_tensors([x, y, z])
// x0,x1 = chunk(x') (x0 has a's type, x1 has b's type)
// y0,y1 = chunk(y') (y0 has a's type, y1 has b's type)
// z0,z1 = chunk(z') (z0 has a's type, z1 has b's type)
// a = op(x0,y0,z0) (a and b keep their original sizes but are now contiguous)
// b = op(x1,y1,z1)
//
// The graph fuser uses an intermediate prim::BroadcastingChunk node to
// represent this behavior concisely. BroadcastingChunk(x, y, z) broadcasts
// all of its inputs and then chunks each input, in order, the same way.
// The above graph is equivalent to:
// x0, x1, y0, y1, z0, z1 = BroadcastingChunk(x, y, z)
// a = op(x0,y0,z0)
// b = op(x1,y1,z1)
//
// NB: The explicit broadcast is important for correctness.
// Let's say we have:
// %z = aten::mul(%x, %y)
// %z.1, %z.2 = aten::chunk(%z, ...)
// ... = prim::FusionGroup(%z.1, %z.2, ...)
// It's possible that %x and %y do not have the same size as %z and
// need to be expanded first so that they can be chunked like %z.
//
// NB: Chunk motion only occurs with fusable consumers, which implies
// that there is always some other operation, e.g., a+b, that happens
// after the chunk, and will be put into the fusion group. This is
// important, because distributing the chunk changes the contiguity
// of a and b, and so the results would be invalid, except that we know
// that simple_mappable operations will restore contiguity before
// we exit the fusion group.
//
// NB: The intermediate BroadcastingChunk is important for moving chunks past
// more than one operation: the graph fuser is not able to easily move
// operations around broadcast_tensors + chunk nodes. Let f, g, h be fusible
// ops:
// x = f(v, w)
// z = g(x, y)
// a, b = chunk(z)
// c = h(a, b)
// becomes (with the broadcast_tensors + chunk approach):
// x = f(v, w)
// x', y' = broadcast_tensors([x, y])
// ax, bx = chunk(x')
// ay, by = chunk(y')
// a = g(ax, ay)
// b = g(bx, by)
// c = h(a, b)
// The broadcast_tensors node makes it harder to move f into the resulting
// FusionGroup of g, g, and h. Keeping the broadcasting and chunk behavior
// together results in:
// x = f(v, w)
// ax, bx, ay, by = BroadcastingChunk(x, y)
// a = g(ax, ay)
// b = g(bx, by)
// c = h(a, b)
// making it easier to move f after the BroadcastingChunk:
// ay, by, av, bv, aw, bw = BroadcastingChunk(y, v, w)
// ax = f(av, aw)
// bx = f(bv, bw)
// a = g(ax, ay)
// b = g(bx, by)
// c = h(a, b)
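// For illustration only (a sketch, not used by the pass): assuming plain
// ATen tensors, the runtime semantics of BroadcastingChunk[chunks, dim]
// amount to:
//
//   std::vector<at::Tensor> broadcastingChunk(
//       at::TensorList inputs, int64_t chunks, int64_t dim) {
//     std::vector<at::Tensor> outputs;
//     for (const auto& t : at::broadcast_tensors(inputs)) {
//       // each broadcasted input contributes `chunks` consecutive outputs
//       for (const auto& piece : t.chunk(chunks, dim)) {
//         outputs.push_back(piece);
//       }
//     }
//     return outputs;
//   }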
bool tryToMoveChunk(Node* consumer, Value* producer) {
  // is the producer the output of a chunk/bchunk node?
  auto* chunk = producer->node();
  if (chunk->kind() != prim::ConstantChunk &&
      chunk->kind() != prim::BroadcastingChunk)
    return false;

// try to find a producer to move after the chunk/bchunk. The producer must
|
|
|
|
|
// be fusible into the consumer.
|
Allow graph fuser to move chunks past multiple nodes. (#14055)
Summary:
Fixes #12290. Also speeds up JIT LSTM forward pass from 8.8ms to 7.8ms; previously, each JIT lstm cell used 2 fused kernels. Now, it only uses one fused kernel (which is how many kernels cudnn uses).
Explanation:
Let f, g, h be fusible ops.
```
x = f(v, w)
z = g(x, y)
a, b = chunk(z)
c = h(a, b)
```
becomes (before this PR):
```
x = f(v, w)
x', y' = broadcast_tensors([x, y])
ax, bx = chunk(x')
ay, by = chunk(y')
a = g(ax, ay)
b = g(bx, by)
c = h(a, b)
```
The graph fuser then puts g, g, and h into one FusionGroup and is unable
to move `x = f(v, w)` into the FusionGroup.
This PR lets the graph fuser move `x = f(v, w)` into the FusionGroup.
It does this by abstracting the broadcast_tensors + multiple chunk nodes
into one intermediate `prim::BroadcastingChunk[chunks, dim]` node.
A `BroadcastingChunk[chunks, dim](*inputs)` node is equivalent to:
- broadcasting all of *inputs
- chunk-ing each broadcasted input into `chunks` chunks along dim `dim`.
Abstracting the broadcasting chunk behavior away, it is now a lot easier
for the graph fuser to move (broadcast + chunk) past an operation. After
this PR, the above graph becomes:
```
x = f(v, w)
ax, bx, ay, by = BroadcastingChunk(x, y)
a = g(ax, ay)
b = g(bx, by)
c = h(a, b)
```
Now, to move `x = f(v, w)` after the BroadcastingChunk, one just needs
to add f's operands to the BroadcastingChunk:
```
ay, by, av, bv, aw, bw = BroadcastingChunk(y, v, w)
ax = f(av, aw)
by = f(bv, bw)
a = g(ax, ay)
b = g(bx, by)
c = h(a, b)
```
cc apaszke mruberry zdevito
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14055
Differential Revision: D13159259
Pulled By: zou3519
fbshipit-source-id: 134e9e645c950384d9be6a06a883a10e17a73d7d
2018-11-26 20:28:44 +00:00
|
|
|
auto it = std::find_if(
|
|
|
|
|
chunk->inputs().begin(),
|
|
|
|
|
chunk->inputs().end(),
|
2018-12-26 14:52:25 +00:00
|
|
|
[&](Value* producer_for_chunk) {
|
2019-01-10 20:25:22 +00:00
|
|
|
return isFusableMap(producer_for_chunk->node()) &&
|
2018-12-04 23:40:41 +00:00
|
|
|
allUsersAreThisConsumerOrCalcSizes(chunk, producer_for_chunk);
|
Allow graph fuser to move chunks past multiple nodes. (#14055)
Summary:
Fixes #12290. Also speeds up JIT LSTM forward pass from 8.8ms to 7.8ms; previously, each JIT lstm cell used 2 fused kernels. Now, it only uses one fused kernel (which is how many kernels cudnn uses).
Explanation:
Let f, g, h be fusible ops.
```
x = f(v, w)
z = g(x, y)
a, b = chunk(z)
c = h(a, b)
```
becomes (before this PR):
```
x = f(v, w)
x', y' = broadcast_tensors([x, y])
ax, bx = chunk(x')
ay, by = chunk(y')
a = g(ax, ay)
b = g(bx, by)
c = h(a, b)
```
The graph fuser then puts g, g, and h into one FusionGroup and is unable
to move `x = f(v, w)` into the FusionGroup.
This PR lets the graph fuser move `x = f(v, w)` into the FusionGroup.
It does this by abstracting the broadcast_tensors + multiple chunk nodes
into one intermediate `prim::BroadcastingChunk[chunks, dim]` node.
A `BroadcastingChunk[chunks, dim](*inputs)` node is equivalent to:
- broadcasting all of *inputs
- chunk-ing each broadcasted input into `chunks` chunks along dim `dim`.
Abstracting the broadcasting chunk behavior away, it is now a lot easier
for the graph fuser to move (broadcast + chunk) past an operation. After
this PR, the above graph becomes:
```
x = f(v, w)
ax, bx, ay, by = BroadcastingChunk(x, y)
a = g(ax, ay)
b = g(bx, by)
c = h(a, b)
```
Now, to move `x = f(v, w)` after the BroadcastingChunk, one just needs
to add f's operands to the BroadcastingChunk:
```
ay, by, av, bv, aw, bw = BroadcastingChunk(y, v, w)
ax = f(av, aw)
by = f(bv, bw)
a = g(ax, ay)
b = g(bx, by)
c = h(a, b)
```
cc apaszke mruberry zdevito
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14055
Differential Revision: D13159259
Pulled By: zou3519
fbshipit-source-id: 134e9e645c950384d9be6a06a883a10e17a73d7d
2018-11-26 20:28:44 +00:00
|
|
|
});
|
|
|
|
|
if (it == chunk->inputs().end()) {
|
2017-08-01 04:46:41 +00:00
|
|
|
return false;
|
Allow graph fuser to move chunks past multiple nodes. (#14055)
Summary:
Fixes #12290. Also speeds up JIT LSTM forward pass from 8.8ms to 7.8ms; previously, each JIT lstm cell used 2 fused kernels. Now, it only uses one fused kernel (which is how many kernels cudnn uses).
Explanation:
Let f, g, h be fusible ops.
```
x = f(v, w)
z = g(x, y)
a, b = chunk(z)
c = h(a, b)
```
becomes (before this PR):
```
x = f(v, w)
x', y' = broadcast_tensors([x, y])
ax, bx = chunk(x')
ay, by = chunk(y')
a = g(ax, ay)
b = g(bx, by)
c = h(a, b)
```
The graph fuser then puts g, g, and h into one FusionGroup and is unable
to move `x = f(v, w)` into the FusionGroup.
This PR lets the graph fuser move `x = f(v, w)` into the FusionGroup.
It does this by abstracting the broadcast_tensors + multiple chunk nodes
into one intermediate `prim::BroadcastingChunk[chunks, dim]` node.
A `BroadcastingChunk[chunks, dim](*inputs)` node is equivalent to:
- broadcasting all of *inputs
- chunk-ing each broadcasted input into `chunks` chunks along dim `dim`.
Abstracting the broadcasting chunk behavior away, it is now a lot easier
for the graph fuser to move (broadcast + chunk) past an operation. After
this PR, the above graph becomes:
```
x = f(v, w)
ax, bx, ay, by = BroadcastingChunk(x, y)
a = g(ax, ay)
b = g(bx, by)
c = h(a, b)
```
Now, to move `x = f(v, w)` after the BroadcastingChunk, one just needs
to add f's operands to the BroadcastingChunk:
```
ay, by, av, bv, aw, bw = BroadcastingChunk(y, v, w)
ax = f(av, aw)
by = f(bv, bw)
a = g(ax, ay)
b = g(bx, by)
c = h(a, b)
```
cc apaszke mruberry zdevito
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14055
Differential Revision: D13159259
Pulled By: zou3519
fbshipit-source-id: 134e9e645c950384d9be6a06a883a10e17a73d7d
2018-11-26 20:28:44 +00:00
|
|
|
}
|
2018-12-26 14:52:25 +00:00
|
|
|
Value* producer_for_chunk = *it;
|
Allow graph fuser to move chunks past multiple nodes. (#14055)
Summary:
Fixes #12290. Also speeds up JIT LSTM forward pass from 8.8ms to 7.8ms; previously, each JIT lstm cell used 2 fused kernels. Now, it only uses one fused kernel (which is how many kernels cudnn uses).
Explanation:
Let f, g, h be fusible ops.
```
x = f(v, w)
z = g(x, y)
a, b = chunk(z)
c = h(a, b)
```
becomes (before this PR):
```
x = f(v, w)
x', y' = broadcast_tensors([x, y])
ax, bx = chunk(x')
ay, by = chunk(y')
a = g(ax, ay)
b = g(bx, by)
c = h(a, b)
```
The graph fuser then puts g, g, and h into one FusionGroup and is unable
to move `x = f(v, w)` into the FusionGroup.
This PR lets the graph fuser move `x = f(v, w)` into the FusionGroup.
It does this by abstracting the broadcast_tensors + multiple chunk nodes
into one intermediate `prim::BroadcastingChunk[chunks, dim]` node.
A `BroadcastingChunk[chunks, dim](*inputs)` node is equivalent to:
- broadcasting all of *inputs
- chunk-ing each broadcasted input into `chunks` chunks along dim `dim`.
Abstracting the broadcasting chunk behavior away, it is now a lot easier
for the graph fuser to move (broadcast + chunk) past an operation. After
this PR, the above graph becomes:
```
x = f(v, w)
ax, bx, ay, by = BroadcastingChunk(x, y)
a = g(ax, ay)
b = g(bx, by)
c = h(a, b)
```
Now, to move `x = f(v, w)` after the BroadcastingChunk, one just needs
to add f's operands to the BroadcastingChunk:
```
ay, by, av, bv, aw, bw = BroadcastingChunk(y, v, w)
ax = f(av, aw)
by = f(bv, bw)
a = g(ax, ay)
b = g(bx, by)
c = h(a, b)
```
cc apaszke mruberry zdevito
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14055
Differential Revision: D13159259
Pulled By: zou3519
fbshipit-source-id: 134e9e645c950384d9be6a06a883a10e17a73d7d
2018-11-26 20:28:44 +00:00
|
|
|
size_t producer_index = it - chunk->inputs().begin();
|
|
|
|
|
|
|
|
|
|
// all uses of the chunk must be in in this consumer
|
Refactor ir.h to distinguish Nodes and Values
This commit adds a Value type similar to the one @ezyang suggested a while
ago for handling multi-return nodes.
Previously if we had a graph like:
a = op1(b)
c, d = op2(a)
Then its in-memory format would look like:
%0 = op1(b)
%1 = op2(%0)
%2 = select(%1, 0)
%2 = select(%1, 1)
Select nodes were used only to handle the multi-output case. In the
single-output case ops referred directly to their uses.
This required special handling for the single- and multi- output cases,
and was confusing when used with ONNX which distinguishes values (the
inputs/outputs of a node) from the nodes themselves (e.g. a Conv).
This commit adds the Node/Value distinction to the IR. In the example
above, `a`, `b`, `c`, and `d` are now Value objects, while `op1` and
`op2` are now Node objects. Inputs/Outputs to the graph are values.
* Nodes now always have multiple outputs, accessible through their `output()`
method.
* Methods exist for adding/removing outputs from a node.
* Nodes own their output Values, destroying a node destroys its outputs and it
is only valid to destroy a node when no uses of its outputs remain.
* Unlike select, Values do not appear in the nodes list.
* The method `node()` on `Value` retrieves its defining node. Calling it
is always valid. For inputs, its kind is "Param". Like "Return" there is a single Param
node representing all inputs.
* For single-output Nodes, the method `output()` retrieves the single
output Value, asserting that the node is in-fact single output.
* Functions are the same, but some functions like `type()` have moved to
Value.
* `replaceAllUsesWith` is now sanely defined for both Values and Nodes.
In the case of Nodes, it replaces all outputs of the node with the outputs
of the replacement node.
* stage is defined both on Node/Value. This is because Inputs require a stage.
* Apart from changing data types from Node->Value most passes remain the same.
Things that previously assumed single-output nodes now have to call output()
to get the node.
* This removes the uses = [...] field in the outputs because it was
getting confusing even before this commit when uses would refer to nodes,
but we print the names of Values. The lint pass validates the use list,
so printing it out seems less necessary.
2017-11-14 11:05:32 +00:00
|
|
|
for (auto s : chunk->outputs()) {
|
|
|
|
|
for (auto u : s->uses()) {
|
2017-09-08 14:44:14 +00:00
|
|
|
if (u.user != consumer)
|
2017-08-01 04:46:41 +00:00
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
Prevent JIT from overspecializing to every single size configuration (#10844)
Summary:
Please review the expects carefully to make sure there are no regressions. I tried to go over them one by one when they changed, but it's sometimes easy to miss finer details.
Summary of changes:
- Renamed `TensorType` to `CompleteTensorType`. Added a new `TensorType` which records only the scalar type, number of dimensions, and device of a value. The argument behind the rename is to encourage people to use `CompleteTensorType` less, as most passes will only have limited information available. To make transition easier `complete_type->cast<TensorType>()` works, and makes our passes work with both kinds of specialization if they don't need extra the extra detail.
- Renamed `ArgumentSpec` to `CompleteArgumentSpec`. Added a new `ArgumentSpec`, which matches argument only at the level of the new `TensorType`.
- Shape analysis can process graphs with both `CompleteTensorType` and `TensorType`.
- Fuser was a part that heavily relied on full shape information being available. Now, we simply try to fuse the largest possible graphs, and have to do run-time checks to make sure they match the code we generate. If they don't, we fall back to regular interpretation. The shape checks are implementing using an optimized method exploiting algebraic properties of shapes with broadcasting, and the relations of broadcasting with pointwise ops. A full written proof of correctness of the shape checking algorithm is included in a comment in `graph_fuser.cpp`.
zdevito ezyang mruberry ngimel csarofeen
Pull Request resolved: https://github.com/pytorch/pytorch/pull/10844
Differential Revision: D9498705
Pulled By: apaszke
fbshipit-source-id: 0c53c2fcebd871cc2a29c260f8d012276479cc61
2018-08-26 16:40:58 +00:00
|
|
|
// multiple return operators
|
2018-12-26 14:52:25 +00:00
|
|
|
Node* producer_for_chunk_node = producer_for_chunk->node();
|
2019-01-24 19:05:07 +00:00
|
|
|
AT_ASSERT(producer_for_chunk_node->outputs().size() == 1);
|
2017-08-01 04:46:41 +00:00
|
|
|
|
Allow graph fuser to move chunks past multiple nodes. (#14055)
Summary:
Fixes #12290. Also speeds up JIT LSTM forward pass from 8.8ms to 7.8ms; previously, each JIT lstm cell used 2 fused kernels. Now, it only uses one fused kernel (which is how many kernels cudnn uses).
Explanation:
Let f, g, h be fusible ops.
```
x = f(v, w)
z = g(x, y)
a, b = chunk(z)
c = h(a, b)
```
becomes (before this PR):
```
x = f(v, w)
x', y' = broadcast_tensors([x, y])
ax, bx = chunk(x')
ay, by = chunk(y')
a = g(ax, ay)
b = g(bx, by)
c = h(a, b)
```
The graph fuser then puts g, g, and h into one FusionGroup and is unable
to move `x = f(v, w)` into the FusionGroup.
This PR lets the graph fuser move `x = f(v, w)` into the FusionGroup.
It does this by abstracting the broadcast_tensors + multiple chunk nodes
into one intermediate `prim::BroadcastingChunk[chunks, dim]` node.
A `BroadcastingChunk[chunks, dim](*inputs)` node is equivalent to:
- broadcasting all of *inputs
- chunk-ing each broadcasted input into `chunks` chunks along dim `dim`.
Abstracting the broadcasting chunk behavior away, it is now a lot easier
for the graph fuser to move (broadcast + chunk) past an operation. After
this PR, the above graph becomes:
```
x = f(v, w)
ax, bx, ay, by = BroadcastingChunk(x, y)
a = g(ax, ay)
b = g(bx, by)
c = h(a, b)
```
Now, to move `x = f(v, w)` after the BroadcastingChunk, one just needs
to add f's operands to the BroadcastingChunk:
```
ay, by, av, bv, aw, bw = BroadcastingChunk(y, v, w)
ax = f(av, aw)
by = f(bv, bw)
a = g(ax, ay)
b = g(bx, by)
c = h(a, b)
```
cc apaszke mruberry zdevito
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14055
Differential Revision: D13159259
Pulled By: zou3519
fbshipit-source-id: 134e9e645c950384d9be6a06a883a10e17a73d7d
2018-11-26 20:28:44 +00:00
|
|
|
// Convert chunk to bchunk, if it isn't one already. The bchunk represents a
|
|
|
|
|
// broadcast and one or more chunk operations.
|
2018-12-26 14:52:25 +00:00
|
|
|
auto* bchunk = chunk;
|
Allow graph fuser to move chunks past multiple nodes. (#14055)
Summary:
Fixes #12290. Also speeds up JIT LSTM forward pass from 8.8ms to 7.8ms; previously, each JIT lstm cell used 2 fused kernels. Now, it only uses one fused kernel (which is how many kernels cudnn uses).
Explanation:
Let f, g, h be fusible ops.
```
x = f(v, w)
z = g(x, y)
a, b = chunk(z)
c = h(a, b)
```
becomes (before this PR):
```
x = f(v, w)
x', y' = broadcast_tensors([x, y])
ax, bx = chunk(x')
ay, by = chunk(y')
a = g(ax, ay)
b = g(bx, by)
c = h(a, b)
```
The graph fuser then puts g, g, and h into one FusionGroup and is unable
to move `x = f(v, w)` into the FusionGroup.
This PR lets the graph fuser move `x = f(v, w)` into the FusionGroup.
It does this by abstracting the broadcast_tensors + multiple chunk nodes
into one intermediate `prim::BroadcastingChunk[chunks, dim]` node.
A `BroadcastingChunk[chunks, dim](*inputs)` node is equivalent to:
- broadcasting all of *inputs
- chunk-ing each broadcasted input into `chunks` chunks along dim `dim`.
Abstracting the broadcasting chunk behavior away, it is now a lot easier
for the graph fuser to move (broadcast + chunk) past an operation. After
this PR, the above graph becomes:
```
x = f(v, w)
ax, bx, ay, by = BroadcastingChunk(x, y)
a = g(ax, ay)
b = g(bx, by)
c = h(a, b)
```
Now, to move `x = f(v, w)` after the BroadcastingChunk, one just needs
to add f's operands to the BroadcastingChunk:
```
ay, by, av, bv, aw, bw = BroadcastingChunk(y, v, w)
ax = f(av, aw)
by = f(bv, bw)
a = g(ax, ay)
b = g(bx, by)
c = h(a, b)
```
cc apaszke mruberry zdevito
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14055
Differential Revision: D13159259
Pulled By: zou3519
fbshipit-source-id: 134e9e645c950384d9be6a06a883a10e17a73d7d
2018-11-26 20:28:44 +00:00
|
|
|
if (chunk->kind() == prim::ConstantChunk) {
|
|
|
|
|
bchunk = promoteChunkToBroadcastingChunk(chunk);
|
|
|
|
|
}
|
|
|
|
|
size_t nchunks = bchunk->i(attr::chunks);
|
|
|
|
|
WithInsertPoint guard(bchunk->next());
std::vector<Value*> producer_chunk_outputs;
for (size_t i = 0; i < nchunks; i++) {
producer_chunk_outputs.push_back(
bchunk->output(nchunks * producer_index + i));
}
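// Note the bchunk's output layout: output (nchunks * i + j) is the j'th
// chunk of its i'th input. producer_chunk_outputs now holds the chunks of
// the producer's output.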
// Add each of op's operands to the bchunk node.
// chunked_inputs[input_nr][chunk_output_idx]
//  = Value* for chunk_output_idx'th output of the chunk(inputs[input_nr])
std::vector<std::vector<Value*>> chunked_inputs;
for (auto input : producer_for_chunk_node->inputs()) {
// XXX: we only work with pointwise ops here, so we know it is valid to
// push the chunk only through tensor arguments (and all other args can
// be safely ignored).
if (!input->type()->isSubtypeOf(TensorType::get()))
continue;
// if 'input' is already an input to the bchunk, reuse it.
auto bchunk_inputs = bchunk->inputs();
auto it = std::find(bchunk_inputs.begin(), bchunk_inputs.end(), input);
if (it != bchunk_inputs.end()) {
chunked_inputs.emplace_back();
auto input_index = std::distance(bchunk_inputs.begin(), it);
for (size_t chunk = 0; chunk < nchunks; ++chunk) {
chunked_inputs.back().push_back(
bchunk->outputs().at(nchunks * input_index + chunk));
}
continue;
}
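// Otherwise 'input' is not yet an operand of the bchunk: add it and mint
// nchunks fresh outputs that stand for its chunks.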
// NB: I decided not to use cloneFrom here, because if we make cloneFrom
// copy selects one day, it is definitely not what you want here (selects
// have different types).
// TODO: Perhaps we should use cloneFrom now, as it seems unlikely
// to copy select nodes now that we have refactored to have a Value
// distinct from Node.
bchunk->addInput(input);
chunked_inputs.emplace_back(); // alas, not C++17: emplace_back doesn't return a reference here
for (auto chunk_sel : producer_chunk_outputs) {
Value* input_chunk_sel = bchunk->addOutput();
input_chunk_sel->setType(chunk_sel->type());
chunked_inputs.back().push_back(input_chunk_sel);
}
}
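// At this point every tensor operand of the producer has a full set of
// chunk outputs on the bchunk, recorded in chunked_inputs.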
// apply the op to each chunk of the chunked operands,
// and then rewrite the graph to use them!
for (auto chunk_sel : producer_chunk_outputs) {
auto original_inputs = producer_for_chunk_node->inputs();
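// The per-chunk op is a copy of the producer (same kind and attributes);
// only its tensor operands are swapped for the matching chunk values.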
Node* chunked_op =
block_->owningGraph()->create(producer_for_chunk_node->kind());
chunked_op->copyAttributes(*producer_for_chunk_node);
chunked_op->output()->setType(chunk_sel->type());
auto chunked_inputs_it = chunked_inputs.begin();
for (Value* original_input : original_inputs) {
if (original_input->type()->isSubtypeOf(TensorType::get())) {
AT_ASSERT(chunked_inputs_it != chunked_inputs.end());
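// chunk_sel->offset() is this output's index into the bchunk's outputs;
// taking it modulo nchunks recovers which chunk this is, per the
// (nchunks * input_index + chunk) layout.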
chunked_op->addInput(
chunked_inputs_it->at(chunk_sel->offset() % nchunks));
++chunked_inputs_it;
} else {
chunked_op->addInput(original_input);
}
}
bchunk->owningGraph()->insertNode(chunked_op);
|
Refactor ir.h to distinguish Nodes and Values
This commit adds a Value type similar to the one @ezyang suggested a while
ago for handling multi-return nodes.
Previously if we had a graph like:
a = op1(b)
c, d = op2(a)
Then its in-memory format would look like:
%0 = op1(b)
%1 = op2(%0)
%2 = select(%1, 0)
%2 = select(%1, 1)
Select nodes were used only to handle the multi-output case. In the
single-output case ops referred directly to their uses.
This required special handling for the single- and multi- output cases,
and was confusing when used with ONNX which distinguishes values (the
inputs/outputs of a node) from the nodes themselves (e.g. a Conv).
This commit adds the Node/Value distinction to the IR. In the example
above, `a`, `b`, `c`, and `d` are now Value objects, while `op1` and
`op2` are now Node objects. Inputs/Outputs to the graph are values.
* Nodes now always have multiple outputs, accessible through their `output()`
method.
* Methods exist for adding/removing outputs from a node.
* Nodes own their output Values, destroying a node destroys its outputs and it
is only valid to destroy a node when no uses of its outputs remain.
* Unlike select, Values do not appear in the nodes list.
* The method `node()` on `Value` retrieves its defining node. Calling it
is always valid. For inputs, its kind is "Param". Like "Return" there is a single Param
node representing all inputs.
* For single-output Nodes, the method `output()` retrieves the single
output Value, asserting that the node is in fact single-output.
* Functions are the same, but some functions like `type()` have moved to
Value.
* `replaceAllUsesWith` is now sanely defined for both Values and Nodes.
In the case of Nodes, it replaces all outputs of the node with the outputs
of the replacement node.
* stage is defined both on Node/Value. This is because Inputs require a stage.
* Apart from changing data types from Node->Value most passes remain the same.
Things that previously assumed single-output nodes now have to call output()
to get the node.
* This removes the uses = [...] field in the outputs because it was
getting confusing even before this commit when uses would refer to nodes,
but we print the names of Values. The lint pass validates the use list,
so printing it out seems less necessary.
2017-11-14 11:05:32 +00:00
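A small illustration of the Node/Value split, written against the IR API used in this file (a sketch; `graph` and the input Value `a` are assumed to exist, and `test::op2` is a made-up kind):
```
// op2 is a Node; its two results are Values, with no select nodes involved.
Node* op2 = graph->insertNode(
    graph->create(Symbol::fromQualString("test::op2"), {a}, /*num_outputs=*/2));
Value* c = op2->outputs().at(0); // a distinct Value per output
Value* d = op2->outputs().at(1);
// node() on a Value recovers its defining Node:
AT_ASSERT(c->node() == op2 && d->node() == op2);
```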
chunk_sel->replaceAllUsesWith(chunked_op->output());
}
bchunk->removeInput(producer_index);
for (size_t i = 0; i < nchunks; i++) {
bchunk->eraseOutput(nchunks * producer_index);
}

// The output of producer_for_chunk_node could have been used in some
// aten::size operators, so we need to clean those up as well (we simply
// broadcast all its tensor inputs).
// We need to insert these early in the graph, i.e. immediately after
// the producer_for_chunk_node as we will have the _size_if_not_same
// that may be before the bchunk.
WithInsertPoint guard2(producer_for_chunk_node);
auto size_calc_uses = producer_for_chunk_node->output()->uses();
if (!size_calc_uses.empty()) {
auto tensor_inputs = filter(
producer_for_chunk_node->inputs(),
Rename DynamicType -> TensorType (#16787)
Summary:
```
import json
from subprocess import check_call
from pprint import pprint
renames = {
'c10::TensorType': 'DimentionedTensorType',
'c10::DynamicType': 'TensorType',
'c10::TensorTypePtr': 'DimentionedTensorTypePtr',
'c10::DynamicTypePtr': 'TensorTypePtr',
'c10::TypeKind::DynamicType': 'TensorType',
'c10::TypeKind::TensorType': 'DimentionedTensorType',
}
entries = json.loads(open('compile_commands.json', 'r').read())
build = None
sources = []
for e in entries:
name = e['file']
if not ('jit' in name or 'ATen/core' in name):
continue
build = e['directory']
sources.append(name)
args = ['clang-rename', '-i', '-force', '-pl']
for name in sorted(renames.keys()):
args += ['-qualified-name={}'.format(name), '-new-name={}'.format(renames[name])]
for source in sources:
cmd = args + [source]
pprint(args)
check_call(cmd, cwd=build)
check_call(['git', 'stash', 'push', '-m', 'rename'])
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/16787
Differential Revision: D13974132
Pulled By: zdevito
fbshipit-source-id: 8368fd53e17cff83707bbe77f2d7aad74f8ce60e
2019-02-07 01:22:47 +00:00
[](Value* v) { return v->type()->isSubtypeOf(TensorType::get()); });
auto tensor_sizes = fmap(tensor_inputs, [](Value* v) {
return v->owningGraph()->insert(aten::size, {v});
});
AT_ASSERT(!tensor_sizes.empty());
Value* output_size = tensor_sizes.size() == 1
? tensor_sizes[0]
: broadcastSizes(tensor_sizes);
for (Use u : size_calc_uses) {
u.user->output()->replaceAllUsesWith(output_size);
u.user->destroy();
}
}
producer_for_chunk_node->destroy();
return true;
}

// returns where to continue scanning, and whether any fusion was made
std::pair<graph_node_list::iterator, bool> scanNode(Node* consumer) {
if (isFusable(consumer)) {
// handle inputs in reverse topological order as well...
// otherwise in f(a,a+b) it will appear a is used twice if we consider
// the f-a fusion before the f-(a+b) fusion first.
auto inputs = sortReverseTopological(consumer->inputs());
for (auto producer : inputs) {
if (tryToMoveChunk(consumer, producer)) {
// the chunk before this consumer was re-arranged to allow fusion,
// we scan this consumer again to perform the fusion
return std::make_pair(consumer->reverseIterator(), true);
}
auto fusion_group = tryFuse(consumer, producer);
if (fusion_group) {
// after fusion, consumer moves into a FusionGroup, so inputs is no
// longer valid so we rescan the new FusionGroup for more fusions...
return std::make_pair(fusion_group.value()->reverseIterator(), true);
}
}
}
return std::make_pair(++consumer->reverseIterator(), false);
}

void replaceIntermediateBroadcastingChunks() {
for (auto it = block_->nodes().rbegin(); it != block_->nodes().rend();) {
auto* node = *it;
++it; // We might delete node, so increment the iterator now.
if (node->kind() != prim::BroadcastingChunk) {
continue;
}
auto* bchunk = node;
insertExplicitBroadcast(bchunk);

auto* graph = block_->owningGraph();
size_t nchunks = bchunk->i(attr::chunks);
WithInsertPoint guard(bchunk->next());

// Split the bchunk into bchunks.inputs().size() number of chunk nodes.
for (size_t input_offset = 0; input_offset < bchunk->inputs().size();
input_offset++) {
auto* input = bchunk->inputs().at(input_offset);

Node* new_chunk =
graph->insertNode(graph->create(prim::ConstantChunk, input, 0));
new_chunk->copyAttributes(*bchunk);
for (size_t output_offset = 0; output_offset < nchunks;
output_offset++) {
auto new_output = new_chunk->addOutput();
auto old_output =
bchunk->outputs().at(input_offset * nchunks + output_offset);
new_output->copyMetadata(old_output);
old_output->replaceAllUsesWith(new_output);
}
}
bchunk->destroy();
}
}

bool usedOnlyInSize(Value* v) {
const auto& uses = v->uses();
return std::all_of(uses.begin(), uses.end(), [](const Use& u) {
return u.user->matches("aten::size(Tensor self) -> int[]");
});
}

// Builds up expressions that compute shapes of all intermediates (and
// outputs) of the fusion group, based on the sizes of inputs. You should run
// DCE to remove those that you end up not using.
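// For example (sketch): for a fusion group computing %c = aten::mul(%a, %b)
// from two tensor inputs, this builds
//   %a_size = aten::size(%a)
//   %b_size = aten::size(%b)
//   %c_size = prim::BroadcastSizes(%a_size, %b_size)
// and maps the subgraph's %c to %c_size.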
std::unordered_map<Value*, Value*> buildShapeExpressions(Node* fusion_group) {
WithInsertPoint insert_guard{fusion_group->next()};
std::unordered_map<Value*, Value*> shape_of;

Graph* graph = fusion_group->owningGraph();
auto subgraph = fusion_group->g(attr::Subgraph);

auto inputs = fusion_group->inputs();
auto sinputs = subgraph->inputs();
AT_ASSERT(inputs.size() == sinputs.size());
for (size_t i = 0; i < inputs.size(); ++i) {
if (inputs[i]->type()->isSubtypeOf(TensorType::get())) {
Get more fusion after autodiff uses SumToSize (#14957)
Summary:
Here is a fresh attempt at getting some fusion back in autodiff-generated graphs in the presence of SumToSize.
- The sum to size operator is now `aten::_grad_sum_to_size` to allow symbolic script differentiation (and that in turn would need to use this in place of sum_to_size to signal that it strictly operates on gradients). This is also used in the autodiff code, replacing `prim::SumToSize`.
- `_grad_sum_to_size` is now fusable, `cat`s - which are fused afterwards thanks to Adam's simplification of the code - are only fused if there is no `_grad_sum_to_size` in the fusion group.
- I push the `_grad_sum_to_size` out of the the fusion group when compiling and record the desired summations in the KernelSpec. The reasoning is the following:
- As the autodiff is a repeated application of the chain rule, we always have the pattern `grad_in = mm(A, grad_out)`, with A often diagonal for cases interesting to the fuser, whence it is `grad_in = a * grad_out` (a pointwise multiplication). We know that only `grad_out` may have AutodiffGradSumToSize applied, so we can commute AutodiffGradSumToSize with the `mul` (and `div` and `neg` are of similar origin).
- For `type_as` the gradient might be giving the type, so just skip SumToSize,
- `add` (which was inserted as `prim::AutogradAdd`) adding gradients when the forward used the same value in several places. This is non-broadcasting, so we know that the two arguments would have the same sizes as inputs - which is good so we don't have to do bookkeeping of the two parts.
Details:
- During fusion, the Tensor arguments are always kept as the first parameters of the fusion group to accommodate indexing assumptions in the fuser.
- The rewriting of the fusion group to record the necessary output transformation and eliminate `_grad_sum_to_size` from the fusion group is now in the fuser compile step.
- In the execution step, the arguments are split into Tensor / Non-Tensor and the non-tensor args are mostly forgotten about except for doing `sum_to_size` at the end. This would want to be improved if/when we fuse nonconstant scalar arguments.
- In a number of places in the fuser, the non-Tensor arguments to the fusion group needed to be ignored.
Thank you, apaszke for the insightful discussion. All bad ideas and errors are my own.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14957
Differential Revision: D13888173
Pulled By: zou3519
fbshipit-source-id: 071992c876e8b845f2b3e6329ae03a835d39a0ea
2019-01-31 19:57:56 +00:00
shape_of[sinputs[i]] = graph->insert(aten::size, {inputs[i]});
}
}
// When we have a guarantee that an output won't be removed, because it's
// used in expressions that don't involve size checks, we can use its size
// instead of computing a long chain of broadcasts, starting from the
// beginning of the kernel.
auto outputs = fusion_group->outputs();
auto soutputs = subgraph->outputs();
AT_ASSERT(outputs.size() == soutputs.size());
for (size_t i = 0; i < outputs.size(); ++i) {
if (usedOnlyInSize(outputs[i]))
continue;
shape_of[soutputs[i]] = graph->insert(aten::size, {outputs[i]});
}

for (Node* n : subgraph->nodes()) {
// XXX: Use of shape_of.emplace is crucial to the output shape
// optimization!
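// (The aten::size results for live outputs were seeded into shape_of
// above; emplace never overwrites those with longer broadcast chains.)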
if (n->kind() == prim::FusedConcat) {
// This is a bit more involved, because we have to account for the case
// when inputs have different shapes, but fortunately those tensors are
// always outputs, and so we can simply avoid replacing their queries,
// because it won't help us.
continue;
}
if (n->kind() == prim::Constant) {
continue;
}
if (n->kind() == prim::ConstantChunk) {
Node* sizes_node = graph->insertNode(
graph->create(prim::ChunkSizes, shape_of.at(n->input()), 2));
sizes_node->i_(attr::dim, n->i(attr::dim));
sizes_node->i_(attr::chunks, n->i(attr::chunks));
Value* regular_size = sizes_node->outputs().at(0);
Value* last_size = sizes_node->outputs().at(1);
regular_size->setType(ListType::ofInts());
last_size->setType(ListType::ofInts());
auto outputs = n->outputs();
for (Value* o : outputs.slice(0, outputs.size() - 1)) {
shape_of.emplace(o, regular_size);
}
shape_of.emplace(outputs.at(outputs.size() - 1), last_size);
continue;
}
auto tensor_inputs = filter(n->inputs(), [](Value* v) {
return v->type()->isSubtypeOf(TensorType::get());
});
auto shapes =
fmap(tensor_inputs, [&](Value* v) { return shape_of.at(v); });
AT_ASSERT(!shapes.empty());
shape_of.emplace(
n->output(), shapes.size() == 1 ? shapes[0] : broadcastSizes(shapes));
}
return shape_of;
}

void removeOutputsUsedOnlyInSize(Node* fusion_group) {
if (fusion_group->kind() != prim::FusionGroup)
return;
auto subgraph = fusion_group->g(attr::Subgraph);

auto shape_of = buildShapeExpressions(fusion_group);
auto outputs = fusion_group->outputs().vec();
auto soutputs = subgraph->outputs().vec();
// XXX: Iterating in this order is not only good for performance reasons!
// It is also crucial for correctness (i has to reflect the current true
// index of outputs[i])!
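// (eraseOutput(i) shifts every later output down by one, so walking
// backwards keeps the indices of not-yet-visited outputs valid.)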
for (int64_t i = static_cast<int64_t>(outputs.size()) - 1; i >= 0; --i) {
auto output = outputs[i];
auto soutput = soutputs[i];
if (usedOnlyInSize(output) && shape_of.count(soutput) > 0) {
auto uses = output->uses();
for (Use u : uses) {
AT_ASSERT(u.user->matches("aten::size(Tensor self) -> int[]"));
u.user->output()->replaceAllUsesWith(shape_of.at(soutput));
u.user->destroy();
}
fusion_group->eraseOutput(i);
subgraph->eraseOutput(i);
}
}
}

void refreshAliasDb() {
aliasDb_ = torch::make_unique<AliasDb>(graph_);
}

bool canFuseWithConcat(Value* producer, Node* before_check) {
if (!isFusable(producer->node())) {
return false;
}
// NB: it is important that this check happens after isFusable, which checks
// that the blocks match, and it's not a special node like prim::Param
if (!aliasDb_->couldMoveBeforeTopologically(
producer->node(), before_check)) {
return false;
}

// If the number of kernel args could exceed the limit, skip.
if ((before_check->inputs().size() + before_check->outputs().size() +
producer->node()->inputs().size() +
producer->node()->outputs().size()) > subgraph_arg_limit_) {
return false;
}

// Fusion groups can be merged with concat's group if and only if
// the value they produce isn't already coming from a concat
if (producer->node()->kind() == prim::FusionGroup) {
auto subgraph = producer->node()->g(attr::Subgraph);
auto* node = subgraph->outputs().at(producer->offset())->node();
return node->kind() != prim::FusedConcat;
}
return true;
}

Node* createFusedConcat(Node* node) {
AT_ASSERT(node->kind() == aten::cat);

Graph* graph = node->owningGraph();
Node* list_construct = node->namedInput(attr::tensors)->node();
int64_t dim = node->get<int64_t>(attr::dim).value();

Node* fused_cat = graph->create(prim::FusedConcat, list_construct->inputs())
->i_(attr::dim, dim);
fused_cat->insertBefore(list_construct);
fused_cat->output()->copyMetadata(node->output());

// NB: this deletes the fused_cat node from the original graph
return createSingletonFusionGroup(fused_cat);
}

void fuseConcats() {
for (auto it = block_->nodes().rbegin(); it != block_->nodes().rend();
++it) {
Node* cat = *it;
if (!isFusableCatNode(cat)) {
continue;
}
Node* list_construct = cat->namedInput(attr::tensors)->node();
Node* fused_cat = createFusedConcat(cat);
Value* fused_cat_out = fused_cat->output();

auto sorted_inputs = sortReverseTopological(fused_cat->inputs());
size_t input_idx = 0;
bool any_fused = false;
while (input_idx < sorted_inputs.size()) {
Value* input = sorted_inputs[input_idx++];
if (!canFuseWithConcat(input, fused_cat)) {
continue;
}
any_fused = true;
auto maybe_group = tryFuse(fused_cat, input);
AT_ASSERT(maybe_group && maybe_group == fused_cat);
// We could have destroyed multiple inputs when performing this fusion,
// so we have to recompute the list and iterate over it again.
sorted_inputs = sortReverseTopological(fused_cat->inputs());
input_idx = 0;
}

if (any_fused) {
cat->output()->replaceAllUsesWith(fused_cat_out);
it.destroyCurrent();
if (list_construct->output()->uses().empty()) {
list_construct->destroy();
}
} else {
fused_cat->destroy();
}
}
}

void optimizeFusedGraphs() {
for (Node* node : block_->nodes()) {
if (node->kind() != prim::FusionGroup) {
continue;
}
auto subgraph = node->g(attr::Subgraph);
EliminateDeadCode(subgraph);
EliminateCommonSubexpression(subgraph);
ConstantPooling(subgraph);
}
}

void run() {
// Run the pass until no changes are made.
// This is necessary, because the algorithm can miss out on certain fusion
// opportunities if run only once. Consider this graph:
//
// %1 = f(...)
// %2 = g(%1)
// %3 = h(%1)
// %4 = l(%3)
// return (%4, %2)
//
// where f, g, h, l are simple map ops.
// The first iteration will fuse %4 and %3, and see that %1 is an input, but
// can't be fused, because it has a different use before the fusion group
// in our topological ordering. Then, %2 will be considered, and fused with
// %1. If we do another iteration, the algorithm will consider the fusion of
// these two groups and fix the situation.
bool any_changed = true;
while (any_changed) {
any_changed = false;
refreshAliasDb();
for (auto it = block_->nodes().rbegin(); it != block_->nodes().rend();) {
bool changed;
std::tie(it, changed) = scanNode(*it);
any_changed |= changed;
}
}
refreshAliasDb();

fuseConcats();

optimizeFusedGraphs();

// The graph fuser can add intermediate prim::BroadcastingChunk nodes.
// Replace them with broadcasts + chunks.
replaceIntermediateBroadcastingChunks();

Move at::chunk into the graph fuser (#10178)
Summary:
... to avoid slow at::chunk (it is slow due to tensor initialization). Picking up from #10026
This is done through the following:
1) Absorb starting chunks into FusionGroup as a part of the graph fuser
pass.
2) When compiling a kernel, emit a `std::vector<ConcatDesc>` that describes if an input (of the original graph) will be chunked.
3) When launching a kernel, use `std::vector<ConcatDesc>` to chunk an
input tensor on the CPU. This chunk directly takes in an at::Tensor and creates
four TensorInfo structs in-place in the argument list, bypassing the creation of intermediate Tensors.
- Expect test and correctness test to see if a single chunk is fused
by the graph fuser
- Correctness test for a variety of chunks (dimension = beginning,
middle, end) and tensors (contiguous, non-contiguous, edge case
(splitSize = 1) for both CPU/CUDA
- Expect test for multiple chunks fused into the same kernel and
correctness test.
cc zdevito apaszke
LSTM forward pass, 1 layer, 512 hidden size and input size, 100 seq length, requires_grad=False on all inputs and weights.
After changes:
```
thnn cudnn jit
8.8468 6.5797 9.3470
```
Before changes:
```
thnn cudnn jit
9.9221 6.6539 11.2550
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/10178
Differential Revision: D9382661
Pulled By: zou3519
fbshipit-source-id: 1f8a749208fbdd45559775ce98cf4eb9558448f8
2018-08-18 22:54:25 +00:00
// Fuse starting chunks into the group.
for (auto it = block_->nodes().rbegin(); it != block_->nodes().rend();) {
it = scanNodeForChunks(*it);
}

// Remove outputs that have been added only because we need their size
for (Node* n : block_->nodes()) {
removeOutputsUsedOnlyInSize(n);
}

for (Node* node : block_->nodes()) {
for (Block* sub_block : node->blocks()) {
GraphFuser(sub_block, graph_, callback_, kind_).run();
}
}
}
};

void PeepholeOptimizeShapeExpressions(Block* block) {
auto nodes = block->nodes();
for (auto it = nodes.begin(); it != nodes.end(); ++it) {
Node* node = *it;
for (Block* subblock : node->blocks()) {
PeepholeOptimizeShapeExpressions(subblock);
}
if (node->kind() == prim::BroadcastSizes) {
// Remove no-op broadcasts.
if (node->inputs().size() == 1) {
node->output()->replaceAllUsesWith(node->input());
it.destroyCurrent();
continue;
}
// Deduplicate inputs, but use their unique() values to ensure
// this process only depends on the graph.
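// For example (sketch): %s = prim::BroadcastSizes(%1, %1, %2)
// becomes %s = prim::BroadcastSizes(%1, %2).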
std::map<size_t, Value*> unique_to_value;
for (Value* input : node->inputs()) {
unique_to_value.emplace(input->unique(), input);
}
if (unique_to_value.size() != node->inputs().size()) {
std::vector<Value*> inputs;
inputs.reserve(unique_to_value.size());
for (auto& entry : unique_to_value) {
inputs.push_back(entry.second);
}
if (inputs.size() == 1) {
node->output()->replaceAllUsesWith(inputs[0]);
} else {
WithInsertPoint insert_guard{node};
node->output()->replaceAllUsesWith(broadcastSizes(inputs));
}
it.destroyCurrent();
--it; // Revisit the node with deduplicated inputs
continue;
}
// Compose simple chains of broadcasts into a single node.
const auto& uses = node->output()->uses();
if (uses.size() == 1 && uses[0].user->kind() == prim::BroadcastSizes) {
Node* user = uses[0].user;
user->removeInput(uses[0].offset);
// NB: we don't care about deduplication in here, as we will visit user
// later.
for (Value* i : node->inputs()) {
user->addInput(i);
}
it.destroyCurrent();
}
}
}
}

} // anonymous namespace
void FuseGraph(std::shared_ptr<Graph>& graph) {
GraphFuser(graph->block(), graph).run();
// After FuseGraph some common subexpressions may come back
EliminateCommonSubexpression(graph);
// We might have emitted a fair amount of useless shape propagating code, so
// remove it
EliminateDeadCode(graph);
// Improve the quality of shape propagation code that was left
PeepholeOptimizeShapeExpressions(graph->block());
}
void CustomFuseGraph(
std::shared_ptr<Graph>& graph,
std::function<bool(Node*)> fn,
Symbol kind,
size_t arg_limit) {
auto g = GraphFuser(
graph->block(),
graph,
[=](Node* n) { return fn(n) || n->kind() == kind; },
kind);
g.setInputArgLimit(arg_limit);
g.run();
}
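// Usage sketch (the callback and symbol name are illustrative only):
//
//   auto is_supported = [](Node* n) { return n->kind() == aten::relu; };
//   CustomFuseGraph(
//       graph,
//       is_supported,
//       Symbol::fromQualString("my_backend::FusionGroup"),
//       /*arg_limit=*/128);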
} // namespace jit
} // namespace torch